In [5]:
# Standard library imports
import os
import re
import json
from collections import Counter, defaultdict
from itertools import combinations

# Third-party library imports
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# NLTK imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import (
    RegexpTokenizer,
    TextTilingTokenizer,
    sent_tokenize,
    word_tokenize,
)
nltk.download("punkt")
nltk.download("punkt_tab")

# Transformers library imports
from tqdm import tqdm
from transformers import (
    pipeline,
    BertTokenizer,
    BertModel,
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoModelForQuestionAnswering,
    BertForQuestionAnswering,
    Trainer,
    TrainingArguments,
)

# Sentence Transformers library imports
from sentence_transformers import SentenceTransformer

# SpaCy model load
import spacy
nlp = spacy.load("en_core_web_sm")

# Environment settings
os.environ["WANDB_DISABLED"] = "true"

[nltk_data] Downloading package punkt to /home/yejashi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/yejashi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
contractions_dict = {
      "ain’t": "are not", "aren’t": "are not", "can’t": "cannot", "can’t’ve": "cannot have",
      "'cause": "because", "could’ve": "could have", "couldn’t": "could not", "couldn’t’ve": "could not have",
      "didn’t": "did not", "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not",
      "hadn’t’ve": "had not have", "hasn’t": "has not", "haven’t": "have not", "he’d": "he would",
      "he’d’ve": "he would have", "he’ll": "he will", "he’ll’ve": "he will have", "how’d": "how did",
      "how’d’y": "how do you", "how’ll": "how will", "I’d": "I would", "I’d’ve": "I would have",
      "I’ll": "I will", "I’ll’ve": "I will have", "I’m": "I am", "I’ve": "I have", "isn’t": "is not",
      "it’d": "it would", "it’d’ve": "it would have", "it’ll": "it will", "it’ll’ve": "it will have",
      "let’s": "let us", "ma’am": "madam", "mayn’t": "may not", "might’ve": "might have",
      "mightn’t": "might not", "mightn’t’ve": "might not have", "must’ve": "must have", "mustn’t": "must not",
      "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have", "o’clock": "of the clock",
      "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not",
      "shan’t’ve": "shall not have", "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will",
      "she’ll’ve": "she will have", "should’ve": "should have", "shouldn’t": "should not",
      "shouldn’t’ve": "should not have", "so’ve": "so have", "that’d": "that would", "that’d’ve": "that would have",
      "there’d": "there would", "there’d’ve": "there would have", "they’d": "they would",
      "they’d’ve": "they would have", "they’ll": "they will", "they’ll’ve": "they will have", "they’re": "they are",
      "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would",
      "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are",
      "we’ve": "we have", "weren’t": "were not", "what’ll": "what will", "what’ll’ve": "what will have",
      "what’re": "what are", "what’ve": "what have", "when’ve": "when have", "where’d": "where did",
      "where’ve": "where have", "who’ll": "who will", "who’ll’ve": "who will have", "who’ve": "who have",
      "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t’ve": "will not have",
      "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have", "y’all": "you all",
      "y’all’d": "you all would", "y’all’d’ve": "you all would have", "y’all’re": "you all are",
      "y’all’ve": "you all have", "you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will",
      "you’ll’ve": "you will have", "you’re": "you are", "you’ve": "you have"
  }

In [7]:
def load_text(file_path):
    """
    Load text from a file.
    """
    print("Loading the text from the file...")
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [8]:
def remove_punctuation(text):
    """
    Remove punctuation from a given text string.

    This function takes a string and removes any character that is not a
    word character (alphanumeric) or whitespace, effectively stripping out
    punctuation.

    Parameters:
        text (str): The input string from which to remove punctuation.

    Returns:
        str: The input string with punctuation removed.
    """
    return re.sub(r'[^\w\s]', '', text)

In [9]:
def expand_contractions(s, contractions_dict, cont_reg):
    """
    Expand contractions in a given text string using a dictionary of contractions.

    This function searches for contractions in the input string `s` using a regular
    expression pattern (`cont_reg`) and replaces each contraction with its expanded
    form from the `contractions_dict`.

    Parameters:
        s (str): The input string containing contractions to expand.
        contractions_dict (dict): A dictionary where keys are contractions (e.g., "can't")
                              and values are their expanded forms (e.g., "cannot").
        cont_reg (re.Pattern): A regular expression pattern to identify
                           contractions within the text.

    Returns:
        str: The input string with contractions expanded.

    """
    def replace(match):
        contraction = match.group(0).lower()
        if contraction in contractions_dict:
            return contractions_dict[contraction]
        return contraction  # Returns original if not in dictionary

    return cont_reg.sub(replace, s)

In [10]:
def remove_gutenberg_header_footer(text, start_marker, end_marker):
    """
    Remove the header and footer from Project Gutenberg text.

    Finds specified start and end markers and returns the content in between.

    Parameters:
        text (str): The text to clean.
        start_marker (str): Marker indicating the start of main content.
        end_marker (str): Marker indicating the end of main content.

    Returns:
        str: Text with header and footer removed, or original text if markers are not found.
    """
    start_index = text.find(start_marker)
    if start_index == -1:
        return text

    end_index = text.find(end_marker)
    if end_index == -1:
        return text

    cleaned_text = text[start_index + len(start_marker):end_index].strip()
    return cleaned_text


In [None]:
def identify_chapters(text):
    """
    Identify chapter markers in text and filter.

    Finds chapter markers (e.g., "Chapter 1", "Chapter II") and returns their positions.
    Filters out chapters if they are located too closely to one another. (Table of contents)

    Parameters:
        text (str): The text in which to identify chapter markers.

    Returns:
        list of tuples: List of chapter markers and their positions, filtered by distance.
    """
    chapter_markers = []
    roman_numeral_pattern = r"(?i)\bchapter\s+(\d+|[IVXLCDM]+)\b"

    for match in re.finditer(roman_numeral_pattern, text):
        chapter_markers.append((match.group(0), match.start()))

    filtered_chapter_markers = []
    for i in range(len(chapter_markers)):
        if i + 1 < len(chapter_markers):
            current_start_index = chapter_markers[i][1]
            next_start_index = chapter_markers[i + 1][1]
            if next_start_index - current_start_index >= 1000:
                filtered_chapter_markers.append(chapter_markers[i])
        else:
            filtered_chapter_markers.append(chapter_markers[i])

    return filtered_chapter_markers


In [12]:
def split_novel_into_chapters(text, chapter_markers):
    """
    Split text into chapters based on chapter marker positions.

    Parameters:
        text (str): The novel text to split.
        chapter_markers (list of tuples): Chapter markers with positions in the text.

    Returns:
        list of str: List of chapter texts.
    """
    chapters = []

    for i in range(len(chapter_markers)):
        chapter_num, chapter_start_index = chapter_markers[i]

        if i + 1 < len(chapter_markers):
            next_chapter_start_index = chapter_markers[i + 1][1]
            chapter_text = text[chapter_start_index:next_chapter_start_index].strip()
        else:
            chapter_text = text[chapter_start_index:].strip()

        if chapter_text:
            chapters.append(chapter_text)

    return chapters


In [13]:
def combine_chapters_remove_chapter_numbers(chapters):
    """
    Combine chapter texts into one and remove chapter numbers.

    Parameters:
        chapters (list of str): List of chapter texts.

    Returns:
        str: Combined text with chapter numbers removed.
    """
    combined_text = ""
    for chapter in chapters:
        chapter = re.sub(r"Chapter\s+([IVXLC]+|\d+)\b\.?\s?.*?\n", "", chapter, flags=re.IGNORECASE)
        combined_text += chapter + "\n"  # Add a newline between chapters

    return combined_text

In [None]:
def extract_text(file_name, start_marker, end_marker, contractions_dict):
    """
    Process a novel text file to extract chapters and combined text with expanded contractions.

    Parameters:
        file_name (str): Path to the novel text file.
        start_marker (str): Marker indicating the start of the main content to extract.
        end_marker (str): Marker indicating the end of the main content to extract.
        contractions_dict (dict): Dictionary of contractions and their expanded forms,
                                    where keys are contractions and values are their expansions.

    Returns:
        tuple:
            - list of str: List of chapter texts with expanded contractions.
            - str: Combined text with chapter numbers removed and contractions expanded.
    """
    novel_text = load_text(file_name)

    novel_text_no_hf = remove_gutenberg_header_footer(novel_text, start_marker, end_marker)
    chapter_markers = identify_chapters(novel_text_no_hf)
    chapters = split_novel_into_chapters(novel_text_no_hf, chapter_markers)

    contractions_dict = {k.lower(): v for k, v in contractions_dict.items()}
    contraction_pattern = r"\b(%s)\b" % '|'.join(map(re.escape, contractions_dict.keys()))
    cont_reg = re.compile(contraction_pattern, re.IGNORECASE)

    chapters = [expand_contractions(chapter, contractions_dict, cont_reg) for chapter in chapters]

    single_text = combine_chapters_remove_chapter_numbers(chapters)

    return chapters, single_text

In [None]:
def normalize_sentence(sentence: str) -> str:

    sentence = re.sub(r'[^\w\s.!?]', '', sentence).strip()

    return sentence

def tokenize_and_normalize(text: str):
    sentences = nltk.sent_tokenize(text)

    normalized_sentences = [normalize_sentence(sentence) for sentence in sentences]

    return normalized_sentences

In [None]:
file_name = 'data/books/a_study_in_scarlet.txt'

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"

chapters, single_text = extract_text(file_name, start_marker, end_marker, contractions_dict)

normalized_sentences = tokenize_and_normalize(single_text)

normalized_sentences_string = " ".join(normalized_sentences)

Loading the text from the file...


Tokenize text at sentence level for BERT tokenizer.  This is generic and will run for any relevant path so do it first.


Recursive Chunking Implementation

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

MAX_TOKENS = 128

def recursive_chunk_with_overlap(text, max_tokens=128, overlap=50):
    """
    Chunk text with a specified overlap between chunks.

    Args:
        text (str): The input text to chunk.
        max_tokens (int): Maximum number of tokens allowed per chunk.
        overlap (int): Number of overlapping tokens between chunks.

    Returns:
        list[str]: A list of overlapping text chunks.
    """
    tokens = tokenizer.tokenize(text)

    if len(tokens) <= max_tokens:
        return [text]

    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = tokenizer.convert_tokens_to_string(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap

    return chunks

text_chunks = recursive_chunk_with_overlap(normalized_sentences_string, max_tokens=128, overlap=50)

print(f"Number of chunks: {len(text_chunks)}")
print(f"Sample chunk: {text_chunks[0]}")

Token indices sequence length is longer than the specified maximum sequence length for this model (49859 > 512). Running this sequence through the model will result in indexing errors


Number of chunks: 640
Sample chunk: in the year 1878 i took my degree of doctor of medicine of the university of london and proceeded to netley to go through the course prescribed for surgeons in the army. having completed my studies there i was duly attached to the fifth northumberland fusiliers as assistant surgeon. the regiment was stationed in india at the time and before i could join it the second afghan war had broken out. on landing at bombay i learned that my corps had advanced through the passes and was already deep in the enemys country. i followed however with many other officers who were in the same situation as myself and succeeded in reaching candahar in safety where i found my regiment


Phrase chunking

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

MAX_TOKENS = 4

def phrase_chunk(sentences, max_tokens=4):

  chunks = []
  source_chunks = []

  for sentence_idx, sentence in enumerate(sentences):
      tokenized_sentence = tokenizer.tokenize(sentence)

      if len(tokenized_sentence) <= max_tokens:
          chunks.append(sentence)
          source_chunks.append(sentence_idx)
      else:
          start = 0
          while start < len(tokenized_sentence):
              end = min(start + max_tokens, len(tokenized_sentence))
              chunk = tokenizer.convert_tokens_to_string(tokenized_sentence[start:end])
              chunks.append(chunk)
              source_chunks.append(sentence_idx)
              start += max_tokens

  return chunks, source_chunks

phrase_chunks, source_chunks = phrase_chunk(normalized_sentences, max_tokens=4)

print(f"Number of chunks: {len(phrase_chunks)}")
print(f"Sample chunk: {phrase_chunks[0]}")

Number of chunks: 13301
Sample chunk: in the year 1878


Document Based Chunking

This chunks based on sentences or paragraphs

In [None]:
from transformers import AutoTokenizer
import re

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

MAX_TOKENS = 256

def chunk_document_by_paragraph(text, max_tokens=MAX_TOKENS):

    # Split the document into paragraphs
    paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by newlines
    chunks = []
    current_chunk = []

    tokenized_paragraphs = [tokenizer.tokenize(p) for p in paragraphs]

    for paragraph, tokenized_paragraph in zip(paragraphs, tokenized_paragraphs):
        if len(tokenized_paragraph) > max_tokens:
            # Split large paragraphs
            for i in range(0, len(tokenized_paragraph), max_tokens):
                chunk = tokenizer.convert_tokens_to_string(tokenized_paragraph[i:i + max_tokens])
                chunks.append(chunk)
            continue  

        if len(tokenized_paragraph) + sum(len(tokenizer.tokenize(p)) for p in current_chunk) > max_tokens:
            if current_chunk: 
                chunks.append(' '.join(current_chunk))
            current_chunk = [paragraph]  
        else:
            current_chunk.append(paragraph)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def chunk_document_by_sentence(text, max_tokens=MAX_TOKENS):
    sentences = re.split(r'(?<=\.)\s+', text)  
    chunks = []
    current_chunk = []

    for sentence in sentences:
        tokenized_sentence = tokenizer.tokenize(sentence)

        if len(tokenized_sentence) + sum(len(tokenizer.tokenize(s)) for s in current_chunk) > max_tokens:
            if current_chunk:  
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence] 
        else:
            current_chunk.append(sentence)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

paragraph_chunks = chunk_document_by_paragraph(normalized_sentences_string, max_tokens=256)
sentence_chunks = chunk_document_by_sentence(normalized_sentences_string, max_tokens=128)

# Print the result
print(f"Paragraph-based chunks: {len(paragraph_chunks)} chunks")
print(f"Sample paragraph chunk: {paragraph_chunks[0]}")

words = word_tokenize(paragraph_chunks[0])
print(len(words))

print(f"\nSentence-based chunks: {len(sentence_chunks)} chunks")
print(f"Sample sentence chunk: {sentence_chunks[0]}")

words = word_tokenize(sentence_chunks[0])
print(len(words))


Token indices sequence length is longer than the specified maximum sequence length for this model (776 > 512). Running this sequence through the model will result in indexing errors


Paragraph-based chunks: 245 chunks
Sample paragraph chunk: in the year 1878 i took my degree of doctor of medicine of the university of london and proceeded to netley to go through the course prescribed for surgeons in the army. having completed my studies there i was duly attached to the fifth northumberland fusiliers as assistant surgeon. the regiment was stationed in india at the time and before i could join it the second afghan war had broken out. on landing at bombay i learned that my corps had advanced through the passes and was already deep in the enemys country. i followed however with many other officers who were in the same situation as myself and succeeded in reaching candahar in safety where i found my regiment and at once entered upon my new duties. the campaign brought honours and promotion to many but for me it had nothing but misfortune and disaster. i was removed from my brigade and attached to the berkshires with whom i served at the fatal battle of maiwand. there i w

Semantic Chunking

In [None]:
from sentence_transformers import SentenceTransformer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_sentence_embeddings(sentences, model=None):
    embeddings = model.encode(sentences, show_progress_bar=True)
    return embeddings  


def semantic_chunking(max_tokens=256, similarity_threshold=0.5, sentence_embeddings=None, sentences=None):
    current_chunk = [sentences[0]]
    current_chunk_embedding = sentence_embeddings[0].reshape(1, -1)
    chunks = []

    # Iterate through the sentences and group them based on semantic similarity
    for i in range(1, len(sentences)):
        current_embedding = sentence_embeddings[i].reshape(1, -1)
        similarity = cosine_similarity(current_chunk_embedding, current_embedding).mean()

        if similarity > similarity_threshold and len(tokenizer.tokenize(' '.join(current_chunk + [sentences[i]]))) <= max_tokens:
            current_chunk.append(sentences[i])
            current_chunk_embedding = np.mean([current_chunk_embedding, current_embedding], axis=0)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentences[i]]
            current_chunk_embedding = current_embedding

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [22]:
my_embeddings = generate_sentence_embeddings(normalized_sentences, model=model)

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches: 100%|██████████| 68/68 [00:04<00:00, 14.23it/s]


Non-continous Semantic Chunking

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def semantic_chunking_non_continuous(max_tokens=256, num_clusters=1000, sentence_embeddings=None, sentences=None):
    """
    Group sentences into non-continuous chunks based on semantic similarity using KMeans clustering.

    Args:
        max_tokens (int): Maximum number of tokens allowed per chunk.
        num_clusters (int): Number of clusters to form with KMeans.
        sentence_embeddings (np.ndarray): Array of sentence embeddings.
        sentences (list[str]): List of sentences.

    Returns:
        list[str]: List of semantic chunks.
    """
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(sentence_embeddings)

    clusters = {}
    for sentence_idx, cluster_label in enumerate(cluster_labels):
        if cluster_label not in clusters:
            clusters[cluster_label] = []
        clusters[cluster_label].append(sentence_idx)

    chunks = []
    for cluster_label, indices in clusters.items():
        cluster_sentences = [sentences[i] for i in indices]

        current_chunk = []
        for sentence in cluster_sentences:
            if len(tokenizer.tokenize(' '.join(current_chunk + [sentence]))) <= max_tokens:
                current_chunk.append(sentence)
            else:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]

        if current_chunk:
            chunks.append(' '.join(current_chunk))

    return chunks


In [None]:
semantic_chunks = semantic_chunking(similarity_threshold=0.5, sentence_embeddings=my_embeddings, sentences=normalized_sentences)

print(f"Number of chunks: {len(semantic_chunks)}")
print(f"Sample sentence chunk: {semantic_chunks[0]}")

Number of chunks: 2087
Sample sentence chunk: In the year 1878 I took my degree of Doctor of Medicine of the
University of London and proceeded to Netley to go through the course
prescribed for surgeons in the army. Having completed my studies there
I was duly attached to the Fifth Northumberland Fusiliers as Assistant
Surgeon. The regiment was stationed in India at the time and before I
could join it the second Afghan war had broken out. On landing at
Bombay I learned that my corps had advanced through the passes and
was already deep in the enemys country. I followed however with many
other officers who were in the same situation as myself and succeeded
in reaching Candahar in safety where I found my regiment and at once
entered upon my new duties.


In [None]:
semantic_chunks_non_continous = semantic_chunking_non_continuous(sentence_embeddings=my_embeddings, sentences=normalized_sentences)

print(f"Number of chunks: {len(semantic_chunks_non_continous)}")
print(f"Sample sentence chunk: {semantic_chunks_non_continous[0]}")

Number of chunks: 1012
Sample sentence chunk: In the year 1878 I took my degree of Doctor of Medicine of the
University of London and proceeded to Netley to go through the course
prescribed for surgeons in the army. Having completed my studies there
I was duly attached to the Fifth Northumberland Fusiliers as Assistant
Surgeon. Worn with pain and weak from the prolonged hardships which I had
undergone I was removed with a great train of wounded sufferers to
the base hospital at Peshawar.


Generate sentence embeddings with SBERT.
We use this model because it was designed to create rich embeddings preserving semantic similarity



In [None]:
from sentence_transformers import SentenceTransformer

def generate_sentence_embeddings(sentences, model=None):
    embeddings = model.encode(sentences, show_progress_bar=True)
    return embeddings 

Load sentence embeddings or generate your own.

SBERT will compute the embeddings very fast. Currently taking only a minute or so without a GPU.

Retrieve Context

This is the driver code of RAG context generation.
Once we compute our embeddings we use the cosine similarity to understand which sentences are close in the embedded space and retreive the top 'k' closest sentences.  We also need to embed our query for comparison.

In [27]:
def retrieve_relevant_info_tensor(query, top_k=5, embeddings=None, sentences=None, model=None):

    embeddings = torch.tensor(embeddings)
    # Compute the query embedding
    query_embedding = torch.tensor(model.encode(query)).unsqueeze(0)  # Shape: [1, embedding_dim]

    # Compute cosine similarities between query and sentence embeddings
    similarities = F.cosine_similarity(embeddings, query_embedding, dim=1)

    # Get top-k most similar sentences
    top_indices = torch.topk(similarities, top_k).indices

    # Retrieve the relevant sentences
    relevant_sentences = [sentences[idx] for idx in top_indices]
    return relevant_sentences


In [28]:
def  retrieve_relevant_info_np(query, top_k=5, embeddings=None, sentences=None, model=None):
  # Convert to NumPy arrays if not already
  embeddings = np.array(embeddings)
  query_embedding = np.array(model.encode(query)).reshape(1, -1)

  # Compute cosine similarity
  similarities = np.dot(embeddings, query_embedding.T).flatten()
  similarities /= np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding)

  # Get top-k most similar sentences
  top_indices = np.argsort(-similarities)[:top_k]
  relevant_sentences = [sentences[idx] for idx in top_indices]
  return relevant_sentences

Retrieve Context - Phrase Chunking

In [29]:
def  retrieve_relevant_info_phrase_chunk(query, top_k=5, embeddings=None, sentences=None, source_chunks=None, model=None):
  # Convert to NumPy arrays if not already
  embeddings = np.array(embeddings)
  query_embedding = np.array(model.encode(query)).reshape(1, -1)

  # Compute cosine similarity
  similarities = np.dot(embeddings, query_embedding.T).flatten()
  similarities /= np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding)

  # Get top-k most similar sentences
  top_indices = np.argsort(-similarities)[:top_k]
  top_indices_source_chunks = [source_chunks[idx] for idx in top_indices]
  relevant_sentences = [sentences[idx] for idx in  top_indices_source_chunks]
  return relevant_sentences

Generate RAG context

In [30]:
#We define our query here so it is consistent throughout retreival and answering.
query = "What word was written on the wall at the crime scene?"

In [31]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Or another SBERT model

chunking_type = sentence_chunks
my_embeddings = generate_sentence_embeddings(chunking_type, model=model)

Batches: 100%|██████████| 14/14 [00:03<00:00,  3.84it/s]


In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

relevant_sentences_tensor = retrieve_relevant_info_tensor(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model)
relevant_sentences_np = retrieve_relevant_info_np(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model)
context_tensor = ' '.join(relevant_sentences_tensor)
context_np = ' '.join(relevant_sentences_np)

# print(f"Context - Tensor")
# for sentence in relevant_sentences_tensor:
#   print(sentence)

# print(f"Context - NP")
# for sentence in relevant_sentences_np:
#   print(sentence)

In [34]:
manual_dataset_path = 'data/misc/Manual_QA_Dataset.json'

with open(manual_dataset_path, 'r') as f:
    all_manual_qa_pairs = json.load(f)
print("Dataset loaded successfully!")

Dataset loaded successfully!


BERTScore - Recursive Chunking

In [None]:
from bert_score import score

model_emb = SentenceTransformer('all-MiniLM-L6-v2')
chunking_type = text_chunks
my_embeddings = generate_sentence_embeddings(chunking_type, model=model_emb)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_qa = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer)

model_rag = SentenceTransformer('all-MiniLM-L6-v2')

generated_ans = []
ground_truth_ans = []
generated_con = []
ground_truth_con = []

for qac in all_manual_qa_pairs:

  print("Ground truth:")
  print(f"Context: {qac['context']}")
  print(f"Question: {qac['question']}")
  print(f"Answer: {qac['answer']}")

  query = qac['question']
  ground_truth_ans.append(qac['answer'])
  ground_truth_con.append(qac['context'])

  print("Genrated:")
  relevant_sentences_tensor = retrieve_relevant_info_tensor(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model_rag)
  context_tensor = ' '.join(relevant_sentences_tensor)
  print(f"Context: {context_tensor}")
  generated_con.append(context_tensor)

  answer = qa_pipeline(question=query, context=context_tensor)["answer"]
  print(f"Answer: {answer}")
  generated_ans.append(answer)

# Compute BERTScore
print("Computing BERTScore for Recursive Chunking - Context...")
P_PT, R_PT, F1_PT = score(generated_con, ground_truth_con, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")

print("Computing BERTScore for Recursive Chunking - Answer...")
P_PT, R_PT, F1_PT = score(generated_ans, ground_truth_ans, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Batches: 100%|██████████| 20/20 [00:05<00:00,  3.57it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ground truth:
Context: Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.
Question: Who introduced Watson to Holmes?
Answer: Stamford
Genrated:
Context: lawyer i suppose. his writing has a legal twist about it. here comes our man i think. as he spoke there was a sharp ring at the bell. sherlock holmes rose softly and moved his chair in the direction of the door. we heard the servant pass along the hall and the sharp click of the latch as she opened it. does dr. watson live here? asked a clear but rather harsh voice. we could not hear the servants reply but the door closed and some one began to ascend the stairs. the footfall was an uncertain an

BERTScore - Paragraph Chunking

In [36]:

model_emb = SentenceTransformer('all-MiniLM-L6-v2')  # Or another SBERT model
chunking_type = paragraph_chunks
my_embeddings = generate_sentence_embeddings(chunking_type, model=model_emb)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_qa = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer)

model_rag = SentenceTransformer('all-MiniLM-L6-v2')

generated_ans = []
ground_truth_ans = []
generated_con = []
ground_truth_con = []

for qac in all_manual_qa_pairs:

  print("Ground truth:")
  print(f"Context: {qac['context']}")
  print(f"Question: {qac['question']}")
  print(f"Answer: {qac['answer']}")

  query = qac['question']
  ground_truth_ans.append(qac['answer'])
  ground_truth_con.append(qac['context'])

  print("Genrated:")
  relevant_sentences_tensor = retrieve_relevant_info_tensor(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model_rag)
  context_tensor = ' '.join(relevant_sentences_tensor)
  print(f"Context: {context_tensor}")
  generated_con.append(context_tensor)

  answer = qa_pipeline(question=query, context=context_tensor)["answer"]
  print(f"Answer: {answer}")
  generated_ans.append(answer)

# Compute BERTScore
print("Computing BERTScore for Paragraph Chunking - Context...")
P_PT, R_PT, F1_PT = score(generated_con, ground_truth_con, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")

print("Computing BERTScore for Paragraph Chunking - Answer...")
P_PT, R_PT, F1_PT = score(generated_ans, ground_truth_ans, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")



Batches: 100%|██████████| 8/8 [00:03<00:00,  2.07it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ground truth:
Context: Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.
Question: Who introduced Watson to Holmes?
Answer: Stamford
Genrated:
Context: as he spoke there was a sharp ring at the bell. sherlock holmes rose softly and moved his chair in the direction of the door. we heard the servant pass along the hall and the sharp click of the latch as she opened it. does dr. watson live here? asked a clear but rather harsh voice. we could not hear the servants reply but the door closed and some one began to ascend the stairs. the footfall was an uncertain and shuffling one. a look of surprise passed over the face of my companion as he listene

BERTScore - Sentence Chunking

In [None]:

model_emb = SentenceTransformer('all-MiniLM-L6-v2')
chunking_type = sentence_chunks
my_embeddings = generate_sentence_embeddings(chunking_type, model=model_emb)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_qa = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer)

model_rag = SentenceTransformer('all-MiniLM-L6-v2')

generated_ans = []
ground_truth_ans = []
generated_con = []
ground_truth_con = []

for qac in all_manual_qa_pairs:

  print("Ground truth:")
  print(f"Context: {qac['context']}")
  print(f"Question: {qac['question']}")
  print(f"Answer: {qac['answer']}")

  query = qac['question']
  ground_truth_ans.append(qac['answer'])
  ground_truth_con.append(qac['context'])

  print("Genrated:")
  relevant_sentences_tensor = retrieve_relevant_info_tensor(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model_rag)
  context_tensor = ' '.join(relevant_sentences_tensor)
  print(f"Context: {context_tensor}")
  generated_con.append(context_tensor)

  answer = qa_pipeline(question=query, context=context_tensor)["answer"]
  print(f"Answer: {answer}")
  generated_ans.append(answer)

# Compute BERTScore
print("Computing BERTScore for Sentence Chunking - Context...")
P_PT, R_PT, F1_PT = score(generated_con, ground_truth_con, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")

print("Computing BERTScore for Sentence Chunking - Answer...")
P_PT, R_PT, F1_PT = score(generated_ans, ground_truth_ans, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")



Batches: 100%|██████████| 14/14 [00:03<00:00,  4.21it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ground truth:
Context: Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.
Question: Who introduced Watson to Holmes?
Answer: Stamford
Genrated:
Context: As he spoke there was a sharp ring at the bell. Sherlock Holmes rose
softly and moved his chair in the direction of the door. We heard the
servant pass along the hall and the sharp click of the latch as she
opened it. Does Dr. Watson live here? asked a clear but rather harsh voice. We
could not hear the servants reply but the door closed and some one
began to ascend the stairs. The footfall was an uncertain and shuffling
one. A look of surprise passed over the face of my companion as he
listene

BERTScore - Semantic Chunking

In [None]:

model_emb = SentenceTransformer('all-MiniLM-L6-v2')  
chunking_type = semantic_chunks
my_embeddings = generate_sentence_embeddings(chunking_type, model=model_emb)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_qa = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer)

model_rag = SentenceTransformer('all-MiniLM-L6-v2')

generated_ans = []
ground_truth_ans = []
generated_con = []
ground_truth_con = []

for qac in all_manual_qa_pairs:

  print("Ground truth:")
  print(f"Context: {qac['context']}")
  print(f"Question: {qac['question']}")
  print(f"Answer: {qac['answer']}")

  query = qac['question']
  ground_truth_ans.append(qac['answer'])
  ground_truth_con.append(qac['context'])

  print("Genrated:")
  relevant_sentences_tensor = retrieve_relevant_info_tensor(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model_rag)
  context_tensor = ' '.join(relevant_sentences_tensor)
  print(f"Context: {context_tensor}")
  generated_con.append(context_tensor)

  answer = qa_pipeline(question=query, context=context_tensor)["answer"]
  print(f"Answer: {answer}")
  generated_ans.append(answer)

# Compute BERTScore
print("Computing BERTScore for Semantic Chunking - Context...")
P_PT, R_PT, F1_PT = score(generated_con, ground_truth_con, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")

print("Computing BERTScore for Semantic Chunking - Answer...")
P_PT, R_PT, F1_PT = score(generated_ans, ground_truth_ans, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")



Batches: 100%|██████████| 66/66 [00:03<00:00, 16.98it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ground truth:
Context: Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.
Question: Who introduced Watson to Holmes?
Answer: Stamford
Genrated:
Context: Precisely so answered Holmes. Watson Mr. Sherlock Holmes said Stamford introducing us. Of course Doctor Watson this is
strictly between ourselves. Whatever have you been doing with yourself Watson? he asked in
undisguised wonder as we rattled through the crowded London streets. He had evidently come with the intention of consulting with
Sherlock Holmes for on perceiving his colleague he appeared to be
embarrassed and put out. There is nothing like first hand evidence he remarked as a matter
of 

BERTScore - Non-continuous Semantic Chunking

In [39]:

model_emb = SentenceTransformer('all-MiniLM-L6-v2')  # Or another SBERT model
chunking_type = semantic_chunks_non_continous
my_embeddings = generate_sentence_embeddings(chunking_type, model=model_emb)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_qa = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer)

model_rag = SentenceTransformer('all-MiniLM-L6-v2')

generated_ans = []
ground_truth_ans = []
generated_con = []
ground_truth_con = []

for qac in all_manual_qa_pairs:

  print("Ground truth:")
  print(f"Context: {qac['context']}")
  print(f"Question: {qac['question']}")
  print(f"Answer: {qac['answer']}")

  query = qac['question']
  ground_truth_ans.append(qac['answer'])
  ground_truth_con.append(qac['context'])

  print("Genrated:")
  relevant_sentences_tensor = retrieve_relevant_info_tensor(query, top_k=10, embeddings=my_embeddings, sentences=chunking_type, model=model_rag)
  context_tensor = ' '.join(relevant_sentences_tensor)
  print(f"Context: {context_tensor}")
  generated_con.append(context_tensor)

  answer = qa_pipeline(question=query, context=context_tensor)["answer"]
  print(f"Answer: {answer}")
  generated_ans.append(answer)

# Compute BERTScore
print("Computing BERTScore for Non-continuous Semantic Chunking - Context...")
P_PT, R_PT, F1_PT = score(generated_con, ground_truth_con, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")

print("Computing BERTScore for Non-continuous Semantic Chunking - Answer...")
P_PT, R_PT, F1_PT = score(generated_ans, ground_truth_ans, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")



Batches: 100%|██████████| 32/32 [00:03<00:00,  8.72it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ground truth:
Context: Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.
Question: Who introduced Watson to Holmes?
Answer: Stamford
Genrated:
Context: Watson Mr. Sherlock Holmes said Stamford introducing us. We came here on business said Stamford sitting down on a high
threelegged stool and pushing another one in my direction with his
foot. He had himself in reply to a question
confirmed Stamfords opinion upon that point. H. is in Europe. There was no name appended to this
message.

And there was nothing else? Holmes asked. You
do not know Sherlock Holmes yet he said perhaps you would not care
for him as a constant companion.

Why what is the

BERTScore - Phrase Chunking

In [None]:

model_emb = SentenceTransformer('all-MiniLM-L6-v2') 
chunking_type = phrase_chunks
my_embeddings = generate_sentence_embeddings(chunking_type, model=model_emb)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_qa = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer)

model_rag = SentenceTransformer('all-MiniLM-L6-v2')

generated_ans = []
ground_truth_ans = []
generated_con = []
ground_truth_con = []

for qac in all_manual_qa_pairs:

  print("Ground truth:")
  print(f"Context: {qac['context']}")
  print(f"Question: {qac['question']}")
  print(f"Answer: {qac['answer']}")

  query = qac['question']
  ground_truth_ans.append(qac['answer'])
  ground_truth_con.append(qac['context'])

  print("Genrated:")
  relevant_sentences_phrase = retrieve_relevant_info_phrase_chunk(query, top_k=5, embeddings=my_embeddings, sentences=normalized_sentences, source_chunks=source_chunks, model=model_rag)
  context_phrase = ' '.join(relevant_sentences_phrase)
  print(f"Context: {context_phrase}")
  generated_con.append(context_phrase)

  answer = qa_pipeline(question=query, context=context_phrase)["answer"]
  print(f"Answer: {answer}")
  generated_ans.append(answer)

# Compute BERTScore
print("Computing BERTScore for Phrase Chunking - Context...")
P_PT, R_PT, F1_PT = score(generated_con, ground_truth_con, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")

print("Computing BERTScore for Phrase Chunking - Answer...")
P_PT, R_PT, F1_PT = score(generated_ans, ground_truth_ans, lang="en", model_type="bert-base-uncased")

# Print scores
print(f"Precision: {P_PT.mean():.4f}")
print(f"Recall: {R_PT.mean():.4f}")
print(f"F1 Score: {F1_PT.mean():.4f}")



Batches: 100%|██████████| 416/416 [00:06<00:00, 61.66it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ground truth:
Context: Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.
Question: Who introduced Watson to Holmes?
Answer: Stamford
Genrated:
Context: Watson Mr. Sherlock Holmes said Stamford introducing us. I guess you are going to take
me to the policestation he remarked to Sherlock Holmes. I will now cut one of these pills in two said Holmes and drawing
his penknife he suited the action to the word. I was the first to
discover what had occurred.

We have been hearing Gregsons view of the matter Holmes observed. Theres more work to be got out of one of those little beggars than
out of a dozen of the force Holmes remarked.
Answer: view of th