In [None]:
# Install required packages
!pip install langchain
!pip install langchain-community
!pip install langchain-text-splitters
!pip install transformers
!pip install torch

# Now import the required modules
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from typing import List, Dict

class SemanticTextSplitter:
    """Embedding-backed splitter that divides text by semantic meaning."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Record the chunking settings and create the embedding model.

        Args:
            chunk_size: Value stored on the instance as ``chunk_size``.
            chunk_overlap: Value stored on the instance as ``chunk_overlap``.
        """
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        # Built with the library's default arguments.
        self.embeddings = HuggingFaceEmbeddings()

**Example Scenarios**

In [None]:
class SemanticTextSplitter:
    """Split text into chunks of semantically similar paragraphs.

    Paragraphs are embedded with ``HuggingFaceEmbeddings``; consecutive
    paragraphs are grouped into one chunk for as long as they stay similar
    to the first paragraph of that chunk.
    """

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 similarity_threshold: float = 0.7):
        """Store the splitter settings and create the embedding model.

        Args:
            chunk_size: Stored on the instance; not consulted by
                ``split_by_semantic_similarity``.
            chunk_overlap: Stored on the instance; not consulted by
                ``split_by_semantic_similarity``.
            similarity_threshold: Cosine similarity a paragraph must
                strictly exceed to be appended to the current chunk.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold
        self.embeddings = HuggingFaceEmbeddings()

    def split_by_semantic_similarity(self, text: str) -> List[str]:
        """Split text based on semantic similarity between paragraphs.

        Args:
            text: Input text; paragraphs are separated by blank lines
                (lines that are empty or contain only whitespace).

        Returns:
            Chunks in document order, each the paragraphs of one group
            joined with a blank line. Empty or whitespace-only input
            yields an empty list.
        """
        import re  # local import: the file's top-level imports do not include it

        # Split on blank lines, tolerating whitespace / '\r' on the "empty" line.
        paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
        if not paragraphs:
            # Nothing to embed; avoids indexing paragraphs[0] on empty input.
            return []

        # One batched call instead of one model call per paragraph.
        vectors = np.asarray(self.embeddings.embed_documents(paragraphs),
                             dtype=float)

        # Scale every vector to unit length so the dot product below is the
        # cosine similarity even if the model does not normalise its output.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # keep zero vectors at zero, no division by 0
        vectors = vectors / norms

        chunks = []
        current_chunk = [paragraphs[0]]
        # Each chunk is anchored on the embedding of its first paragraph.
        anchor = vectors[0]

        for paragraph, vector in zip(paragraphs[1:], vectors[1:]):
            similarity = float(np.dot(anchor, vector))
            if similarity > self.similarity_threshold:
                current_chunk.append(paragraph)
            else:
                # Close the current chunk and start a new one at this paragraph.
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [paragraph]
                anchor = vector

        # current_chunk always holds at least one paragraph here.
        chunks.append('\n\n'.join(current_chunk))
        return chunks

def test_semantic_splitting():
   """Run the semantic splitter on a three-topic sample and print each chunk."""

   # Sample text: three unrelated topics, one paragraph each
   sample_text = """
   Machine Learning Fundamentals
   Machine learning models learn from data patterns. They can identify
   complex relationships and make predictions. Training requires large
   datasets and computational resources.

   Climate Change Effects
   Global temperatures are rising at unprecedented rates. This causes
   melting ice caps and rising sea levels. Many species are at risk
   due to habitat changes.

   Space Exploration Progress
   Recent Mars missions have provided new insights. Private companies
   are now leading space innovation. The search for extraterrestrial
   life continues.
   """

   pieces = SemanticTextSplitter().split_by_semantic_similarity(sample_text)

   # Separator printed above and below every chunk
   rule = "-" * 50

   print("Semantic Splitting Results:")
   for i, piece in enumerate(pieces, 1):
       print(f"\nChunk {i}:", rule, piece, rule, sep="\n")

# Advanced usage example
def process_technical_document():
   """Split a three-section neural-network text and print each section."""

   # Sample text: three related sections about neural networks
   sample_text = """
   Neural Network Architecture
   Neural networks consist of layers of neurons. Each neuron processes
   inputs through an activation function. The network learns by adjusting
   connection weights.

   Training Process
   Training involves forward propagation of data and backpropagation
   of errors. The network adjusts weights to minimize loss. Regular
   validation prevents overfitting.

   Applications in Computer Vision
   Computer vision uses neural networks for image recognition.
   Convolutional layers extract features automatically. This enables
   accurate object detection and classification.
   """

   # Splitter built with a non-default chunk_size of 500
   sections = SemanticTextSplitter(chunk_size=500).split_by_semantic_similarity(
       sample_text
   )

   # Separator printed above and below every section
   rule = "-" * 50

   print("\nTechnical Document Processing:")
   for i, section in enumerate(sections, 1):
       print(f"\nSection {i}:", rule, section, rule, sep="\n")

if __name__ == "__main__":
   # Run both demos in order: basic splitting first, then the technical document.
   for demo in (test_semantic_splitting, process_technical_document):
       demo()