In [2]:
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AutoModelForQuestionAnswering, BertForQuestionAnswering
from datasets import load_dataset, Dataset
import torch
import torch.nn.functional as F
import re
from nltk.tokenize import sent_tokenize
import nltk
import json
import os
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
2024-12-04 20:24:45.561989: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733361885.572613  401836 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733361885.576104  401836 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 20:24:45.588470: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Disable WandB
import os
os.environ["WANDB_DISABLED"] = "true"

# NLTK downloads
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# NLTK imports
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import (
    RegexpTokenizer,
    TextTilingTokenizer,
    sent_tokenize,
    word_tokenize,
)


[nltk_data] Downloading package punkt to /home/yejashi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yejashi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/yejashi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
def load_text(file_path):
    """
    Load text from a file.
    """
    print("Loading the text from the file...")
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def remove_punctuation(text):
    """
    Remove punctuation from a given text string.

    This function takes a string and removes any character that is not a
    word character (alphanumeric) or whitespace, effectively stripping out
    punctuation.

    Parameters:
        text (str): The input string from which to remove punctuation.

    Returns:
        str: The input string with punctuation removed.
    """
    return re.sub(r'[^\w\s]', '', text)

def expand_contractions(s, contractions_dict, cont_reg):
    """
    Expand contractions in a given text string using a dictionary of contractions.

    This function searches for contractions in the input string `s` using a regular
    expression pattern (`cont_reg`) and replaces each contraction with its expanded
    form from the `contractions_dict`.

    Parameters:
        s (str): The input string containing contractions to expand.
        contractions_dict (dict): A dictionary where keys are contractions (e.g., "can't")
                              and values are their expanded forms (e.g., "cannot").
        cont_reg (re.Pattern): A regular expression pattern to identify
                           contractions within the text.

    Returns:
        str: The input string with contractions expanded.

    """
    def replace(match):
        contraction = match.group(0).lower()
        if contraction in contractions_dict:
            return contractions_dict[contraction]
        return contraction  # Returns original if not in dictionary

    return cont_reg.sub(replace, s)

def remove_gutenberg_header_footer(text, start_marker, end_marker):
    """
    Remove the header and footer from Project Gutenberg text.

    Finds specified start and end markers and returns the content in between.

    Parameters:
        text (str): The text to clean.
        start_marker (str): Marker indicating the start of main content.
        end_marker (str): Marker indicating the end of main content.

    Returns:
        str: Text with header and footer removed, or original text if markers are not found.
    """
    start_index = text.find(start_marker)
    if start_index == -1:
        return text

    end_index = text.find(end_marker)
    if end_index == -1:
        return text

    cleaned_text = text[start_index + len(start_marker):end_index].strip()
    return cleaned_text


def identify_chapters(text):
    """
    Identify chapter markers in text and filter.

    Finds chapter markers (e.g., "Chapter 1", "Chapter II") and returns their positions.
    Filters out chapters if they are located too closely to one another. (Table of contents)

    Parameters:
        text (str): The text in which to identify chapter markers.

    Returns:
        list of tuples: List of chapter markers and their positions, filtered by distance.
    """
    chapter_markers = []
    roman_numeral_pattern = r"(?i)\bchapter\s+(\d+|[IVXLCDM]+)\b"

    for match in re.finditer(roman_numeral_pattern, text):
        chapter_markers.append((match.group(0), match.start()))

    # Remove chapters with start index difference less than 1000
    filtered_chapter_markers = []
    for i in range(len(chapter_markers)):
        if i + 1 < len(chapter_markers):  # Check if there's a next chapter
            current_start_index = chapter_markers[i][1]
            next_start_index = chapter_markers[i + 1][1]
            if next_start_index - current_start_index >= 1000:
                filtered_chapter_markers.append(chapter_markers[i])
        else:
            # Always include the last chapter
            filtered_chapter_markers.append(chapter_markers[i])

    return filtered_chapter_markers

def split_novel_into_chapters(text, chapter_markers):
    """
    Split text into chapters based on chapter marker positions.

    Parameters:
        text (str): The novel text to split.
        chapter_markers (list of tuples): Chapter markers with positions in the text.

    Returns:
        list of str: List of chapter texts.
    """
    chapters = []

    for i in range(len(chapter_markers)):
        chapter_num, chapter_start_index = chapter_markers[i]

        if i + 1 < len(chapter_markers):
            next_chapter_start_index = chapter_markers[i + 1][1]
            chapter_text = text[chapter_start_index:next_chapter_start_index].strip()
        else:
            chapter_text = text[chapter_start_index:].strip()

        if chapter_text:
            chapters.append(chapter_text)

    return chapters

def combine_chapters_remove_chapter_numbers(chapters):
    """
    Combine chapter texts into one and remove chapter numbers.

    Parameters:
        chapters (list of str): List of chapter texts.

    Returns:
        str: Combined text with chapter numbers removed.
    """
    combined_text = ""
    for chapter in chapters:
        chapter = re.sub(r"Chapter\s+([IVXLC]+|\d+)\b\.?\s?.*?\n", "", chapter, flags=re.IGNORECASE)
        combined_text += chapter + "\n"  # Add a newline between chapters

    return combined_text


In [5]:
start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"

character_aliases = {
    "Sherlock Holmes": ["Sherlock Holmes", "Mr. Sherlock Holmes", "Holmes", "Mr. Holmes", "Sherlock"],
    "Dr. John Watson": ["Dr. John Watson", "Dr. Watson", "Watson", "John Watson", "Doctor", "Doctor Watson"],
    "Inspector Lestrade": ["Mr. Lestrade", "Inspector Lestrade", "Lestrade"],
    "Inspector Gregson": ["Mr. Gregson", "Inspector Gregson", "Gregson", "Tobias Gregson"],
    "Rance": ["Rance", "the constable", "John Rance"],
    "Madame Charpentier": ["Madame Chapentier", "Mrs. Chapentier"],
    "Arthur Charpentier": ["Arthur Charpentier", "Arthur", "Lieutenant Charpentier", "Young Charpentier"],
    "Alice": ["Alice"],
    "Stamford": ["Stamford"],
    "Joseph Stangerson": ["Joseph Stangerson", "Mr. Joseph Stangerson", "Stangerson", "Mr. Stangerson"],
    "Enoch Drebber": ["Enoch J. Drebber", "Enoch Drebber", "Mr. Enoch Drebber", "Drebber", "Mr. Drebber", "Brother Drebber", "E. J. Drebber"],
    "Jefferson Hope": ["Jefferson Hope", "Mr. Jefferson Hope", "Hope", "Jefferson", "J.H."],
    "Lucy Ferrier": ["Lucy Ferrier", "Lucy"],
    "John Ferrier": ["John Ferrier", "Ferrier", "Brother Ferrier"],
    "Brigham Young": ["Brigham Young"]
    # Add more characters as needed
}

character_aliases_subset = {
    "Sherlock Holmes": ["Sherlock Holmes", "Mr. Sherlock Holmes", "Holmes", "Mr. Holmes", "Sherlock"],
    "Dr. John Watson": ["Dr. John Watson", "Dr. Watson", "Watson", "John Watson", "Doctor", "Doctor Watson"],
    "Inspector Lestrade": ["Mr. Lestrade", "Inspector Lestrade", "Lestrade"],
    "Inspector Gregson": ["Mr. Gregson", "Inspector Gregson", "Gregson", "Tobias Gregson"],
    "Joseph Stangerson": ["Joseph Stangerson", "Mr. Joseph Stangerson", "Stangerson", "Mr. Stangerson"],
    "Enoch Drebber": ["Enoch J. Drebber", "Enoch Drebber", "Mr. Enoch Drebber", "Drebber", "Mr. Drebber", "Brother Drebber", "E. J. Drebber"],
    "Jefferson Hope": ["Jefferson Hope", "Mr. Jefferson Hope", "Hope", "Jefferson", "J.H."],
    "Lucy Ferrier": ["Lucy Ferrier", "Lucy"],
    "John Ferrier": ["John Ferrier", "Ferrier", "Brother Ferrier"]

}

contractions_dict = {
      "ain’t": "are not", "aren’t": "are not", "can’t": "cannot", "can’t’ve": "cannot have",
      "'cause": "because", "could’ve": "could have", "couldn’t": "could not", "couldn’t’ve": "could not have",
      "didn’t": "did not", "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not",
      "hadn’t’ve": "had not have", "hasn’t": "has not", "haven’t": "have not", "he’d": "he would",
      "he’d’ve": "he would have", "he’ll": "he will", "he’ll’ve": "he will have", "how’d": "how did",
      "how’d’y": "how do you", "how’ll": "how will", "I’d": "I would", "I’d’ve": "I would have",
      "I’ll": "I will", "I’ll’ve": "I will have", "I’m": "I am", "I’ve": "I have", "isn’t": "is not",
      "it’d": "it would", "it’d’ve": "it would have", "it’ll": "it will", "it’ll’ve": "it will have",
      "let’s": "let us", "ma’am": "madam", "mayn’t": "may not", "might’ve": "might have",
      "mightn’t": "might not", "mightn’t’ve": "might not have", "must’ve": "must have", "mustn’t": "must not",
      "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have", "o’clock": "of the clock",
      "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not",
      "shan’t’ve": "shall not have", "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will",
      "she’ll’ve": "she will have", "should’ve": "should have", "shouldn’t": "should not",
      "shouldn’t’ve": "should not have", "so’ve": "so have", "that’d": "that would", "that’d’ve": "that would have",
      "there’d": "there would", "there’d’ve": "there would have", "they’d": "they would",
      "they’d’ve": "they would have", "they’ll": "they will", "they’ll’ve": "they will have", "they’re": "they are",
      "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would",
      "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are",
      "we’ve": "we have", "weren’t": "were not", "what’ll": "what will", "what’ll’ve": "what will have",
      "what’re": "what are", "what’ve": "what have", "when’ve": "when have", "where’d": "where did",
      "where’ve": "where have", "who’ll": "who will", "who’ll’ve": "who will have", "who’ve": "who have",
      "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t’ve": "will not have",
      "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have", "y’all": "you all",
      "y’all’d": "you all would", "y’all’d’ve": "you all would have", "y’all’re": "you all are",
      "y’all’ve": "you all have", "you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will",
      "you’ll’ve": "you will have", "you’re": "you are", "you’ve": "you have"
  }


In [6]:
book_path = "data/books/a_study_in_scarlet.txt"

novel_text = load_text(book_path)

# Remove header/footer and split into chapters
novel_text_no_hf = remove_gutenberg_header_footer(novel_text, start_marker, end_marker)
chapter_markers = identify_chapters(novel_text_no_hf)
chapters = split_novel_into_chapters(novel_text_no_hf, chapter_markers)

# Expand contractions in chapters
contractions_dict = {k.lower(): v for k, v in contractions_dict.items()}
contraction_pattern = r"\b(%s)\b" % '|'.join(map(re.escape, contractions_dict.keys()))
cont_reg = re.compile(contraction_pattern, re.IGNORECASE)

chapters = [expand_contractions(chapter, contractions_dict, cont_reg) for chapter in chapters]

# # Combine chapters into single text for analysis
single_text = combine_chapters_remove_chapter_numbers(chapters)

Loading the text from the file...


In [7]:
def tokenize_chapters(chapters):
    """
    Tokenize chapters into sentences and remove chapter titles and punctuation.

    This function processes a list of chapters by removing chapter titles,
    converting chapters into sentences, and stripping punctuation.

    Parameters:
        chapters (list): A list of chapter texts to tokenize.

    Returns:
        tuple: A tuple containing two lists:
            - A list of tokenized sentences with chapter titles and punctuation removed.
            - A list of tokenized sentences with chapter titles removed but punctuation retained.
    """
    chapters_new = chapters.copy()
    chapters_new_punct = chapters.copy()

    for idx, chapter in enumerate(chapters_new):
        # chapters_new[idx] = chapter.replace("\n", " ")
        chapters_new_punct[idx] = chapter.replace("\n", " \n ")

        # Remove chapter name
        chap_reg = r"(?i)\bchapter\s+(\d+|[IVXLCDM]+)\b"

        chapters_new[idx] = re.sub(chap_reg, "", chapters_new[idx])
        chapters_new_punct[idx] = re.sub(chap_reg, "", chapters_new_punct[idx])

        # Tokenize into sentences
        chapters_new[idx]  = sent_tokenize(chapters_new[idx])
        chapters_new_punct[idx]  = sent_tokenize(chapters_new_punct[idx])

        # Remove punctuation
        chapters_new[idx] = [remove_punctuation(sent) for sent in chapters_new[idx]]

        # Remove first two sentences (assumed to be titles or similar)
        chapters_new[idx] = chapters_new[idx][2:]
        chapters_new_punct[idx] = chapters_new_punct[idx][2:]

    return chapters_new, chapters_new_punct

In [8]:
def get_sentence_sentiments(chapters_new):
    """
    Analyze the sentiment of each sentence in the provided chapters.

    This function computes sentiment scores for each sentence in the chapters
    using the Sentiment Intensity Analyzer. It returns a list of overall sentence
    sentiments and a nested list of sentiments organized by chapter.

    Parameters:
        chapters_new (list of list of str): A nested list where each sublist contains
                                            sentences from a chapter.

    Returns:
        tuple: A tuple containing:
            - list: A list of sentiment scores for each sentence.
            - list: A nested list of sentiment scores organized by chapter.
    """
    sentence_sentiments = []
    by_chapter_sentence_sentiments = []

    sid = SentimentIntensityAnalyzer()

    c = 0
    for chapter in range(0, len(chapters_new)):
        dummy = []
        for sentence in range(0, len(chapters_new[chapter])):
            sentiment = sid.polarity_scores(chapters_new[chapter][sentence])
            sentence_sentiments.append(sentiment)
            dummy.append(sentiment)
            c += 1
        by_chapter_sentence_sentiments.append(dummy)

    return sentence_sentiments, by_chapter_sentence_sentiments

In [9]:
def calculate_sentiment(text):
    """
    Calculate the sentiment score of a given text using SentimentIntensityAnalyzer.

    Parameters:
        text (str): The text to analyze.

    Returns:
        dict: A dictionary containing sentiment scores (negative, neutral, positive, compound).
    """
    sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(text)

def _capture_scenes(character_limit, chapter, character_aliases):
    """
    Capture scenes from a chapter based on character mentions.

    Parameters:
        character_limit (int): Maximum number of unique characters allowed in a scene.
        chapter (list of str): A list of sentences from the chapter.
        character_aliases (dict): A dictionary mapping character names to their aliases.

    Returns:
        tuple: A tuple containing:
            - scenes (list of tuples): Each tuple contains the start and end indices of a scene.
            - scene_characters (list of lists): A list of characters mentioned in each scene.
    """
    visited_mentions = []
    start_scene = 0
    scene_characters = []
    scenes = []

    added_scene = False
    for sentence_idx, sentence in enumerate(chapter):
        for character, aliases in character_aliases.items():
            for alias in aliases:
                if character not in visited_mentions and re.search(r'\b' + re.escape(alias) + r'\b', sentence, re.IGNORECASE):
                    if len(visited_mentions) + 1 > character_limit:
                        scene_characters.append(visited_mentions)
                        scenes.append((start_scene, sentence_idx - 1))
                        start_scene = sentence_idx
                        visited_mentions = []
                        added_scene = True
                    visited_mentions.append(character)
        if len(visited_mentions) <= 3:
            added_scene = False

    if not added_scene and len(visited_mentions) > 0:
        scene_characters.append(visited_mentions)
        scenes.append((start_scene, start_scene + len(visited_mentions)))

    return scenes, scene_characters

def compose_scenes(chapters, character_aliases, character_limit=3):
    """
    Compose scenes from all chapters based on character mentions.

    Parameters:
        chapters (list of lists): A list of chapters, where each chapter is a list of sentences.
        character_aliases (dict): A dictionary mapping character names to their aliases.
        character_limit (int): Maximum number of unique characters allowed in a scene.

    Returns:
        list of tuples: Each tuple contains scenes and characters for each chapter.
    """
    chapter_scenes = []
    for chap_idx, chapter in enumerate(chapters):
        scenes, scene_characters = _capture_scenes(character_limit, chapter, character_aliases)
        chapter_scenes.append((scenes, scene_characters))
    return chapter_scenes

def compose_scene_characteristics(chapters_new, chapters_new_punct, chapter_scenes):
    """
    Analyze scene characteristics, including sentiment and involved characters.

    Parameters:
        chapters_new (list of lists): Tokenized chapters (without punctuation).
        chapters_new_punct (list of lists): Tokenized chapters (with punctuation).
        chapter_scenes (list of tuples): Scenes and their respective characters for each chapter.

    Returns:
        tuple: A tuple containing:
            - chapter_scene_sentiments (list of lists): Sentiment scores for each scene.
            - aggregate_scene_sentences (list of str): All sentences from the scenes.
            - aggregate_scene_sentences_punct (list of str): All sentences from the scenes with punctuation.
            - aggregate_scene_chapter_sentiments (list of float): Aggregate sentiment scores for all scenes.
            - aggregate_scene_characters (list of lists): Characters involved in all scenes.
    """
    chapter_scene_sentiments = []
    aggregate_scene_sentences = []
    aggregate_scene_sentences_punct = []

    for chapter_idx, chapter in enumerate(chapter_scenes):
        sentiments = []
        for idx, scene in enumerate(chapter[0]):
            sentences = str(chapters_new[chapter_idx][scene[0] : scene[1] + 1])

            aggregate_scene_sentences.append(" ".join(chapters_new[chapter_idx][scene[0] : scene[1] + 1]))
            sentiments.append(calculate_sentiment(sentences)['compound'])
        chapter_scene_sentiments.append(sentiments)

    for chapter_idx, chapter in enumerate(chapter_scenes):
        sentiments = []
        for idx, scene in enumerate(chapter[0]):
            sentences = str(chapters_new[chapter_idx][scene[0] : scene[1] + 1])
            aggregate_scene_sentences_punct.append(" ".join(chapters_new_punct[chapter_idx][scene[0] : scene[1] + 1]))

    aggregate_scene_chapter_sentiments = [sentiment for scene in chapter_scene_sentiments for sentiment in scene]

    aggregate_scene_characters = []

    for chapter in chapter_scenes:
        for characters in chapter[1]:
            aggregate_scene_characters.append(characters)

    return chapter_scene_sentiments, aggregate_scene_sentences, aggregate_scene_sentences_punct, aggregate_scene_chapter_sentiments, aggregate_scene_characters


In [10]:
chapters_new, chapters_new_punct = tokenize_chapters(chapters)

In [11]:
chapter_scenes = compose_scenes(chapters_new, character_aliases, character_limit=3)

In [12]:
chapter_scene_sentiments, aggregate_scene_sentences, aggregate_scene_sentences_punct, aggregate_scene_chapter_sentiments, aggregate_scene_characters = compose_scene_characteristics(chapters_new, chapters_new_punct, chapter_scenes)

Generate Sentence Embeddings with Bert

In [13]:
def generate_sentence_embeddings(sentences, model=None):
  embeddings = model.encode(sentences, show_progress_bar=True)
  return embeddings

Load sentence embeddings or generate your own

In [14]:
aggregate_scene_sentences_lower = [i.lower().replace("\n", " ") for i in aggregate_scene_sentences]

In [15]:
chunks = {}

for idx, scene_sentences in enumerate(aggregate_scene_sentences_lower):
  tmp = {
      "characters_present": aggregate_scene_characters[idx],
      "sentences": scene_sentences
  }

  chunks[idx] = tmp

In [16]:
len(chunks)

73

In [17]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

scene_sentence_embeddings = []
for scene in chunks:
  scene_sentence_embeddings.append(generate_sentence_embeddings(str(scene), model))

Batches: 100%|██████████| 1/1 [00:00<00:00, 141.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 238.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 185.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 190.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 196.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 224.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 202.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 213.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 232.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 214.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 185.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 201.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 203.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 200.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 205.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 213.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 198.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 191.

Working on RAG doot doot, doot doot

In [18]:
def retrieve_relevant_info(query, top_k=5, embeddings=None, sentences=None, model=None):

    embeddings = torch.tensor(embeddings)
    # Compute the query embedding
    query_embedding = torch.tensor(model.encode(query)).unsqueeze(0)  # Shape: [1, embedding_dim]

    # Compute cosine similarities between query and sentence embeddings
    similarities = F.cosine_similarity(embeddings, query_embedding, dim=1)

    # Get top-k most similar sentences
    top_indices = torch.topk(similarities, top_k).indices

    # Retrieve the relevant sentences
    relevant_sentences = [sentences[idx] for idx in top_indices]
    return relevant_sentences


Generate RAG context

In [19]:
chunks_list = []
for i in range(0,73):
  chunks_list.append(str(chunks[i]))

In [21]:
query = "Who did Jefferson Hope kill in a Study in Scarlet?"

In [23]:
model = SentenceTransformer('all-MiniLM-L6-v2')

relevant_sentences = retrieve_relevant_info(query.lower(), top_k=10, embeddings=scene_sentence_embeddings, sentences=chunks_list, model=model)
context = ' '.join(relevant_sentences)

# Uncomment to sentences or context
# for sentence in relevant_sentences:
#   print(sentence)

# print(len(context))

In [None]:
def truncate_context(question, context, tokenizer, max_length=512):
    """
    Truncates the context to fit within the model's max token limit.

    Args:
        question: The input question.
        context: The input context string.
        tokenizer: The tokenizer used to calculate token length.
        max_length: The maximum allowed length of tokens.

    Returns:
        A truncated context string.
    """
    tokens = tokenizer(question, context, truncation=True, max_length=max_length, return_tensors="pt")
    return tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)

Testing the out of box model

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


answer = qa_pipeline(question=query, context=context)["answer"]
print('\n\n')
print(answer)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.





here mr sherlock holmes he said we are all ready to acknowledge
