In [6]:
import os
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Combine lines into multi-line chunks
def chunk_lines(lines, window_size=5):
    """Group consecutive lines into overlapping sliding-window chunks.

    Each chunk joins ``window_size`` neighbouring lines with a single space;
    the window advances one line at a time.

    Args:
        lines: Sequence of strings (one entry per line).
        window_size: Number of lines per chunk. Must be >= 1.

    Returns:
        A ``(chunks, indices)`` tuple. ``chunks`` is a list of joined strings
        and ``indices`` is a parallel list of ``(start, end)`` pairs such that
        ``lines[start:end]`` are the lines that make up the chunk.
        Both lists are empty when ``lines`` is empty.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    total = len(lines)
    if total == 0:
        return [], []

    # Inputs shorter than the window used to yield no chunks at all, which
    # silently dropped short texts. Clamp the window so they produce exactly
    # one chunk covering every line.
    span = min(window_size, total)

    chunks = []
    indices = []
    for start in range(total - span + 1):
        end = start + span
        chunks.append(" ".join(lines[start:end]))
        indices.append((start, end))  # half-open [start, end) range into lines
    return chunks, indices

In [10]:
# Read text files and prepare chunks
def read_text_files_from_directory(directory_path, chunk_size=5):
    """Load every ``.txt`` file in a directory and split it into line chunks.

    Files are decoded as UTF-8, falling back to Latin-1 when UTF-8 decoding
    fails. Blank lines are dropped and the remaining lines are stripped of
    surrounding whitespace before being chunked with ``chunk_lines``.

    Args:
        directory_path: Directory to scan (not recursive).
        chunk_size: Number of lines per chunk, forwarded to ``chunk_lines``
            as ``window_size``.

    Returns:
        A ``(chapters, original_lines)`` tuple. ``chapters`` is a list of
        ``(chapter_name, line_chunks, chunk_indices)`` tuples, one per file
        that produced at least one chunk. ``original_lines`` maps every
        chapter name (file name without extension) to its cleaned lines,
        including files that produced no chunks.
    """
    chapters = []
    original_lines = {}  # for retrieving original text

    # os.listdir returns entries in arbitrary order; sort so the chapter order
    # (and therefore tie-breaking between equal scores) is reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue

        chapter_name = os.path.splitext(filename)[0]
        filepath = os.path.join(directory_path, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            # Latin-1 can decode any byte sequence, so this retry cannot fail
            # with a decode error.
            with open(filepath, 'r', encoding='latin-1') as f:
                lines = f.readlines()

        clean_lines = [stripped for stripped in (line.strip() for line in lines) if stripped]
        original_lines[chapter_name] = clean_lines

        if clean_lines:
            line_chunks, chunk_indices = chunk_lines(clean_lines, window_size=chunk_size)
        else:
            line_chunks, chunk_indices = [], []

        # A chapter without chunks has nothing to encode or search; passing it
        # on would hand an empty batch to the encoder and to argmax later.
        if not line_chunks:
            continue

        chapters.append((chapter_name, line_chunks, chunk_indices))

    return chapters, original_lines

In [3]:
# Encode all chapter chunks
def encode_chapters(model, chapters):
    """Attach embeddings to every chapter's chunks.

    Args:
        model: Object exposing ``encode(sentences, convert_to_tensor=True)``.
        chapters: Iterable of ``(chapter_name, chunks, indices)`` tuples.

    Returns:
        A list of ``(chapter_name, chunks, indices, embeddings)`` tuples in
        the same order as ``chapters``, where ``embeddings`` is whatever
        ``model.encode`` returned for that chapter's chunks.
    """
    return [
        (name, chunk_texts, spans, model.encode(chunk_texts, convert_to_tensor=True))
        for name, chunk_texts, spans in chapters
    ]

In [4]:
# Find best match and highlight best line
def find_best_match(query, encoded_chapters, original_lines, model):
    """Locate the chunk most similar to ``query`` and its best single line.

    The search is two-stage: first the highest-scoring chunk across all
    chapters is selected by cosine similarity, then each line of that chunk
    is encoded on its own and the highest-scoring line is picked.

    Args:
        query: Query text to search for.
        encoded_chapters: Iterable of
            ``(chapter_name, chunks, indices, embeddings)`` tuples as built
            by ``encode_chapters``.
        original_lines: Mapping of chapter name to its list of cleaned lines.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        ``(lines_in_chunk, best_line, best_chapter, best_score)`` where
        ``lines_in_chunk`` are the original lines of the winning chunk,
        ``best_line`` is one of them, ``best_chapter`` is the chapter name
        and ``best_score`` is the chunk-level cosine similarity as a float.

    Raises:
        ValueError: If no chapter contains any chunk to compare against.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Cosine similarity can be zero or negative, so start below any real
    # score; starting at 0 left the winner unset for such queries.
    best_score = float("-inf")
    best_chapter = None
    best_lines_range = None

    for chapter_name, chunks, indices, embeddings in encoded_chapters:
        if not chunks:
            # Nothing to score; argmax over an empty tensor would raise.
            continue

        cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
        best_score_idx = torch.argmax(cosine_scores).item()
        score = cosine_scores[best_score_idx].item()

        # Strict comparison: on ties the earliest chapter wins.
        if score > best_score:
            best_score = score
            best_lines_range = indices[best_score_idx]
            best_chapter = chapter_name

    if best_chapter is None:
        raise ValueError("No chunks available to match the query against.")

    # Now, find best matching individual line within the best chunk
    start, end = best_lines_range
    lines_in_chunk = original_lines[best_chapter][start:end]
    line_embeddings = model.encode(lines_in_chunk, convert_to_tensor=True)
    line_scores = util.pytorch_cos_sim(query_embedding, line_embeddings)[0]
    best_line_idx = torch.argmax(line_scores).item()
    best_line = lines_in_chunk[best_line_idx]

    return lines_in_chunk, best_line, best_chapter, best_score

In [11]:

# Configuration: everything a reader might want to tune lives here.
directory_path = "chapters"
MODEL_NAME = 'all-MiniLM-L6-v2'
CHUNK_SIZE = 3  # lines per search window

# Read the corpus first so a missing or empty directory fails before the
# (slower) model load.
chapters, original_lines = read_text_files_from_directory(directory_path, chunk_size=CHUNK_SIZE)
if not chapters:
    raise RuntimeError(f"No usable .txt chapters found in {directory_path!r}")

model = SentenceTransformer(MODEL_NAME)
encoded_chapters = encode_chapters(model, chapters)

    

In [12]:
# Main
if __name__ == "__main__":
    # Interactive entry point: ask for a query, then print the best-matching
    # context window with its single best line highlighted.
    query = input("Enter your query: ")
    lines_in_chunk, best_line, best_chapter, score = find_best_match(query, encoded_chapters, original_lines, model)

    # Highlight by position, not by text equality: a line that is repeated
    # inside the window would otherwise be marked more than once. The first
    # occurrence is the one marked.
    highlight_idx = lines_in_chunk.index(best_line) if best_line in lines_in_chunk else -1

    print(f"\n📖 Best match from: {best_chapter}")
    print("🧩 Context:")
    for idx, line in enumerate(lines_in_chunk):
        if idx == highlight_idx:
            print(f">>> {line}  <<<")  # Highlight the best line
        else:
            print(line)
    print(f"\n🔍 Similarity: {score * 100:.2f}%")


📖 Best match from: J. K. Rowling - Harry Potter 1 - Sorcerer's Stone
🧩 Context:
>>> his head. As he measured, he said, "Every Ollivander wand has a core of  <<<
a powerful magical substance, Mr. Potter. We use unicorn hairs, phoenix
tail feathers, and the heartstrings of dragons. No two Ollivander wands

🔍 Similarity: 63.47%
