In [None]:
%%capture
!pip install llama-index==0.10.25 llama-index-embeddings-fastembed qdrant-client llama-index-vector-stores-qdrant llama-index-llms-cohere

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Put the shared helper directory on the import path so that the `utils`
# module can be imported right after this block.
sys.path.append("../helpers")

# Read key/value pairs from the project-level .env file into the environment.
load_dotenv("../.env")

# Patch asyncio so that event loops can be nested inside the loop the
# notebook kernel is already running.
nest_asyncio.apply()

from utils import setup_llm, setup_embed_model, setup_vector_store

In [None]:
# Take the Cohere key from the environment when it is set and non-empty;
# otherwise prompt for it without echoing the input.
# os.environ.get returns None for a missing variable (os.environ[...] would
# raise KeyError), which is what makes the getpass fallback reachable.
CO_API_KEY = os.environ.get('CO_API_KEY') or getpass("Enter your Cohere API key: ")

In [None]:
# Take the Qdrant URL from the environment when it is set and non-empty;
# otherwise prompt for it.
# os.environ.get returns None for a missing variable (os.environ[...] would
# raise KeyError), which is what makes the getpass fallback reachable.
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter your Qdrant URL:")

In [None]:
# Take the Qdrant API key from the environment when it is set and non-empty;
# otherwise prompt for it without echoing the input.
# os.environ.get returns None for a missing variable (os.environ[...] would
# raise KeyError), which is what makes the getpass fallback reachable.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model

# Configure the LLM through the project helper, handing it the Cohere API key.
# NOTE(review): presumably the helper registers the model on llama-index
# `Settings` — confirm in ../helpers/utils.py.
setup_llm(api_key=CO_API_KEY)

# Configure the embedding model through the project helper, selecting the
# "fastembed" provider.
setup_embed_model(provider="fastembed")

In [None]:
from utils import get_documents_from_docstore

# Load the documents persisted under ../data/words-of-the-senpais via the
# project helper.
# NOTE(review): presumably returns a list of llama-index Document objects —
# confirm in ../helpers/utils.py.
senpai_documents = get_documents_from_docstore("../data/words-of-the-senpais")

In [None]:
from datasets import load_dataset

def _meets_groundedness_threshold(example):
    """Return True when the row has a groundedness score and it is at least 4."""
    score = example['question_groundedness_score']
    return score is not None and score >= 4


# Fetch the train split of the evaluation set and keep only the rows that
# pass the groundedness filter.
eval_dataset = load_dataset(
    "harpreetsahota/LI_Learning_RAG_Eval_Set", split='train'
).filter(_meets_groundedness_threshold)

In [None]:
# Show the raw text of one sample document. The loaded collection is named
# `senpai_documents`; no `documents` variable exists on a fresh kernel.
print(senpai_documents[42].text)

# 🪟`SentenceWindowNodeParser`

The `SentenceWindowNodeParser` is unique in that it focuses on individual sentences while also capturing the surrounding context. This is particularly helpful for tasks that depend on the broader context around a sentence.

### How it Works

1. **Sentence Splitting:** 

    *   Similar to `SentenceSplitter`, it first divides the document into individual sentences using a sentence tokenizer (defaults to [`PunktSentenceTokenizer`](https://www.nltk.org/api/nltk.tokenize.PunktSentenceTokenizer.html) from the `nltk` library).

2. **Window Creation:**
    *   For each sentence (node), it gathers a "window" of surrounding sentences based on the specified `window_size`. 

    *   This window is stored in the node's metadata under the `window_metadata_key`.

3. **Metadata Management:**

    *   The original sentence text is also stored in the metadata under `original_text_metadata_key`.

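    *   Importantly, both metadata entries (the window and the original text) are excluded from the metadata shown to the embedding model and the LLM; only the sentence itself, stored as the node's text, is embedded.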

### Arguments you need to know

*   **`window_size`**: Controls the number of sentences to include before and after the central sentence in the window.

*   **`window_metadata_key`**: The key used to store the window text in the node's metadata.

*   **`original_text_metadata_key`**: The key used to store the original sentence text in the metadata.

### Usage Example

```python
from llama_index.core.node_parser import SentenceWindowNodeParser

parser = SentenceWindowNodeParser(window_size=2)

nodes = parser.get_nodes_from_documents(documents)
```

### When to Use SentenceWindowNodeParser

*   **Tasks requiring sentence-level understanding with context:** 
    *   Question answering, summarization, or sentiment analysis where the surrounding sentences provide valuable context.

*   **Fine-grained control over embedding scope:** 
    *   Creating embeddings that focus on the specific meaning of a sentence within its local context.
    
*   **Combining with `MetadataReplacementPostProcessor`:**
    *   Replacing the original sentence with its surrounding window before sending it to the LLM, allowing the model to consider the broader context.


In [None]:
# Display the full sample document object (text plus metadata). The loaded
# collection is named `senpai_documents`; no `documents` variable exists on a
# fresh kernel.
senpai_documents[42]

In [None]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Build sentence-window nodes for the single sample document, with a window
# of 2 sentences. The loaded collection is named `senpai_documents`; no
# `documents` variable exists on a fresh kernel.
SentenceWindowNodeParser(window_size=2).build_window_nodes_from_documents([senpai_documents[42]])

In [None]:
# Parse the single sample document through the standard node-parser entry
# point, this time with a window of 3 sentences. The loaded collection is
# named `senpai_documents`; no `documents` variable exists on a fresh kernel.
SentenceWindowNodeParser(window_size=3).get_nodes_from_documents([senpai_documents[42]])

In [None]:
def sentence_window_splitter(window, documents):
    """Split documents into per-sentence nodes that carry a window of context.

    Args:
        window: Number of sentences to include before and after each sentence;
            forwarded as ``window_size`` to ``SentenceWindowNodeParser``.
        documents: Sequence of llama-index documents to parse.

    Returns:
        The nodes produced by
        ``SentenceWindowNodeParser.get_nodes_from_documents``.
    """
    # Imported locally so the function works on its own, without relying on
    # an earlier notebook cell having been executed.
    from llama_index.core.node_parser import SentenceWindowNodeParser

    # Use the sentence-window parser so that `window` actually controls how
    # much surrounding context each node records.
    parser = SentenceWindowNodeParser(window_size=window)
    return parser.get_nodes_from_documents(documents)