In [1]:
# import sys
# sys.path.append('C:/Users/muhammad.ehsan/hr_automation/Lib/site-packages')
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def sentence_level_chunking(text):
    sentences = sent_tokenize(text)  # Tokenize the text into sentences
    return sentences

text = "The defendant was charged with robbery. The crime took place in 2023. The defense claims the defendant was not present."
chunks = sentence_level_chunking(text)
print(chunks)


['The defendant was charged with robbery.', 'The crime took place in 2023.', 'The defense claims the defendant was not present.']


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def paragraph_level_chunking(text):
    paragraphs = text.split("\n\n")  # Split text by empty lines representing paragraphs
    return paragraphs

text = """The defendant was charged with robbery and assault.
The crime took place on July 15, 2023, at the victim’s residence.

The prosecution argues that the defendant was seen fleeing the scene.
The defense claims that the defendant was not present."""
chunks = paragraph_level_chunking(text)
print(chunks)


['The defendant was charged with robbery and assault.\nThe crime took place on July 15, 2023, at the victim’s residence.', 'The prosecution argues that the defendant was seen fleeing the scene.\nThe defense claims that the defendant was not present.']


In [3]:
def topic_based_chunking(text, topics):
    topic_chunks = []
    for topic in topics:
        topic_chunks.append([sentence for sentence in text.split('.') if topic in sentence])
    return topic_chunks

text = "The defendant was charged with robbery. The crime took place in 2023. The defense claims the defendant was not present. Witnesses saw the defendant fleeing."
topics = ["robbery", "witnesses"]
chunks = topic_based_chunking(text, topics)
print(chunks)


[['The defendant was charged with robbery'], []]


In [4]:
def fixed_size_chunking(text, chunk_size=50):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

text = "The defendant was charged with robbery and assault. The crime took place in 2023 at the victim’s residence. Witnesses saw the defendant fleeing."
chunks = fixed_size_chunking(text, chunk_size=10)
print(chunks)


['The defendant was charged with robbery and assault. The crime', 'took place in 2023 at the victim’s residence. Witnesses saw', 'the defendant fleeing.']


In [5]:
def context_aware_chunking(text, threshold=50):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk + sentence) < threshold:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

text = "The defendant was charged with robbery. The crime took place in 2023. The defense claims the defendant was not present. Witnesses saw the defendant fleeing."
chunks = context_aware_chunking(text, threshold=100)
print(chunks)


['The defendant was charged with robbery. The crime took place in 2023.', 'The defense claims the defendant was not present.Witnesses saw the defendant fleeing.']


In [6]:
def hybrid_chunking(text, chunk_size=50):
    sentences = sent_tokenize(text)
    all_sentences = ' '.join(sentences)  # Sentence-level chunking first
    words = all_sentences.split()  # Fixed-size chunking on top
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

text = "The defendant was charged with robbery. The crime took place in 2023. Witnesses saw the defendant fleeing. The defense claims the defendant was not present."
chunks = hybrid_chunking(text, chunk_size=10)
print(chunks)


['The defendant was charged with robbery. The crime took place', 'in 2023. Witnesses saw the defendant fleeing. The defense claims', 'the defendant was not present.']


In [7]:
chunks

['The defendant was charged with robbery. The crime took place',
 'in 2023. Witnesses saw the defendant fleeing. The defense claims',
 'the defendant was not present.']

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.tokenize import sent_tokenize

# Load the pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# Context-aware chunking using embeddings
def context_aware_embedding_chunking(text, similarity_threshold=0.7, max_chunk_size=100):
    sentences = sent_tokenize(text)
    embeddings = model.encode(sentences)
    
    chunks = []
    current_chunk = []
    current_chunk_embedding = np.zeros_like(embeddings[0])
    
    for i, sentence in enumerate(sentences):
        current_embedding = embeddings[i]
        
        if len(current_chunk) == 0:
            current_chunk.append(sentence)
            current_chunk_embedding = current_embedding
            continue
        
        similarity = cosine_similarity(current_chunk_embedding, current_embedding)
        
        if similarity > similarity_threshold and len(current_chunk) < max_chunk_size:
            current_chunk.append(sentence)
            current_chunk_embedding = np.mean(model.encode(current_chunk), axis=0)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_chunk_embedding = current_embedding
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

# Example text
text = """The defendant was charged with robbery. The crime took place in 2023.
The defense claims the defendant was not present at the crime scene. The prosecution has witnesses that saw the defendant leaving the location of the crime. 
The court proceedings started in early 2023. Sentences can vary based on various factors, such as the severity of the crime and criminal history."""
chunks = context_aware_embedding_chunking(text)
for chunk in chunks:
    print(chunk)


  from tqdm.autonotebook import tqdm, trange


The defendant was charged with robbery.
The crime took place in 2023.
The defense claims the defendant was not present at the crime scene.
The prosecution has witnesses that saw the defendant leaving the location of the crime.
The court proceedings started in early 2023.
Sentences can vary based on various factors, such as the severity of the crime and criminal history.


In [10]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     ----------------------------------- -- 41.0/44.4 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 44.4/44.4 kB 437.6 kB/s eta 0:00:00
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.4.5-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.20.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.tokenize import sent_tokenize

# Load the pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# Context-aware chunking using embeddings
def context_aware_embedding_chunking(text, similarity_threshold=0.7, max_chunk_size=100):
    sentences = sent_tokenize(text)
    embeddings = model.encode(sentences)
    
    chunks = []
    current_chunk = []
    current_chunk_embedding = np.zeros_like(embeddings[0])
    
    for i, sentence in enumerate(sentences):
        current_embedding = embeddings[i]
        
        if len(current_chunk) == 0:
            current_chunk.append(sentence)
            current_chunk_embedding = current_embedding
            continue
        
        similarity = cosine_similarity(current_chunk_embedding, current_embedding)
        
        if similarity > similarity_threshold and len(current_chunk) < max_chunk_size:
            current_chunk.append(sentence)
            current_chunk_embedding = np.mean(model.encode(current_chunk), axis=0)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_chunk_embedding = current_embedding
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

# Example text
text = """The defendant was charged with robbery. The crime took place in 2023.
The defense claims the defendant was not present at the crime scene. The prosecution has witnesses that saw the defendant leaving the location of the crime. 
The court proceedings started in early 2023. Sentences can vary based on various factors, such as the severity of the crime and criminal history."""
chunks = context_aware_embedding_chunking(text)
for chunk in chunks:
    print(chunk)


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The defendant was charged with robbery.
The crime took place in 2023.
The defense claims the defendant was not present at the crime scene.
The prosecution has witnesses that saw the defendant leaving the location of the crime.
The court proceedings started in early 2023.
Sentences can vary based on various factors, such as the severity of the crime and criminal history.


In [12]:
from sklearn.cluster import KMeans

def topic_based_embedding_chunking(text, n_topics=3):
    sentences = sent_tokenize(text)
    embeddings = model.encode(sentences)
    
    # Clustering sentences based on embeddings
    clustering_model = KMeans(n_clusters=n_topics)
    clustering_model.fit(embeddings)
    
    cluster_labels = clustering_model.labels_
    
    topic_chunks = {}
    for i, label in enumerate(cluster_labels):
        if label not in topic_chunks:
            topic_chunks[label] = []
        topic_chunks[label].append(sentences[i])
    
    return [" ".join(chunk) for chunk in topic_chunks.values()]

# Example text
text = """The defendant was charged with robbery. The crime took place in 2023.
The defense claims the defendant was not present. Sentencing can vary based on factors. Prosecution has witness testimony."""
chunks = topic_based_embedding_chunking(text)
for chunk in chunks:
    print(chunk)


  super()._check_params_vs_input(X, default_n_init=10)


The defendant was charged with robbery. The defense claims the defendant was not present. Prosecution has witness testimony.
The crime took place in 2023.
Sentencing can vary based on factors.
