In [None]:
# Install necessary libraries
! pip install llama-index-embeddings-openai
! pip install llama-index-readers-file


In [None]:
# Import required libraries
import os
from getpass import getpass

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

# Set OpenAI API key and initialize embedding model
# The key is never written into the notebook: an OPENAI_API_KEY already present
# in the environment is reused, and the user is prompted (input hidden) only
# when it is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
embed_model = OpenAIEmbedding()

# Build the two node parsers used in this notebook
# Baseline parser: sentence splitter with chunk_size=512.
base_splitter = SentenceSplitter(chunk_size=512)
# Semantic parser: uses the embedding model created above.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

# Load documents
# The input file can be overridden through the CHUNKING_INPUT_FILE environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default.
input_file = os.environ.get(
    "CHUNKING_INPUT_FILE",
    "C:/Users/admin/OneDrive/Desktop/Chunking_Embedding/Dataset/simple.html",
)
documents = SimpleDirectoryReader(input_files=[input_file]).load_data()

# Apply semantic chunking
nodes = splitter.get_nodes_from_documents(documents)

# Inspect chunks
# Same two chunks as before (indices 1 and 2), but guarded so a document that
# yields fewer chunks does not raise IndexError. Labels are generic because
# the chunk contents depend on the input file.
print(f"Semantic splitter produced {len(nodes)} chunk(s)\n")
for chunk_index in (1, 2):
    if chunk_index < len(nodes):
        print(f"Chunk {chunk_index}:\n", nodes[chunk_index].get_content(), "\n")
    else:
        print(f"Chunk {chunk_index}: not available\n")

In [None]:
# Setup query engine
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_source_node

# Create index for semantic chunks and baseline chunks
vector_index = VectorStoreIndex(nodes)
query_engine = vector_index.as_query_engine()

# Baseline: chunk the same documents with the sentence splitter.
# get_nodes_from_documents() is the node-parser entry point (the same call used
# for the semantic splitter); SentenceSplitter has no split_documents() method.
base_nodes = base_splitter.get_nodes_from_documents(documents)
base_vector_index = VectorStoreIndex(base_nodes)
base_query_engine = base_vector_index.as_query_engine()

# Ask the semantic-chunk query engine a question and print its answer
question = "Explain some formula's?"
response = query_engine.query(question)
answer_text = str(response)
print("Response:\n", answer_text, "\n")

# Render each source node attached to the response
for source_node in response.source_nodes:
    display_source_node(source_node, source_length=20000)
