In [18]:
import os 
from nltk.tokenize import sent_tokenize 
from typing import List, Tuple, Dict, Any, Optional, Generator
from langchain_community.utils.math import cosine_similarity
from rag.pipeline import SharedEmbeddingModel
from natsort import natsorted
import time

In [19]:
def lazy_read(file_handle, chunk_size_kb=4):
    """
    Generator that yields chunks of specified size from an open file.
    
    Args:
        file_handle: Open file object in read mode
        chunk_size_kb: Size of each chunk in KB (default: 4KB)
    
    Yields:
        str: Chunks of the file content
    """
    chunk_size_bytes = chunk_size_kb * 1024
    
    while True:
        chunk = file_handle.read(chunk_size_bytes)
        if not chunk:
            break
        yield chunk

In [20]:
with open(r"C:\Users\22bcscs055\Downloads\final_train\collection_1.tsv", "r", encoding="utf-8") as f:
    start_time = time.time()
    chunks = list(lazy_read(f, chunk_size_kb=4))
    end_time = time.time()
    print(f"Time taken to read chunks: {end_time - start_time} seconds")
# print(chunks)

Time taken to read chunks: 2.566915988922119 seconds


In [21]:
with open(r"C:\Users\22bcscs055\Downloads\final_train\collection_1.tsv", "r", encoding="utf-8") as f:
    start_time = time.time()
    chunks = f.read()
    end_time = time.time()
    print(f"Time taken to read entire file: {end_time - start_time} seconds")
# print(chunks)

Time taken to read entire file: 10.579607009887695 seconds


In [16]:
def _semantic_chunking(file_path: str, shared_model) -> Generator[Tuple[str, str, int], None, None]:
        filename = os.path.basename(file_path)
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                chunk = f.read()

                chunk_id = 0
                chunk_size = len(chunk)
                sentences = []
                sentences = sent_tokenize(chunk)
                big_sentences = []
                for i in range(len(sentences)-1):
                    if i == len(sentences)-1:
                        big_sentences.append(sentences[i])
                        break
                    if len(sentences[i]) < 20:
                        sentences[i+1] = sentences[i]+ " "+ sentences[i+1]
                    else:
                        big_sentences.append(sentences[i])

                sentences = big_sentences
                # print(sentences)
                embeddings = []
                combined_sentences = []
                combined_sentences.append(sentences[0])
                distances = [0]
                for i in range(1,len(sentences)):
                    combined_sentences.append(sentences[i-1]+sentences[i])

                # print(combined_sentences)
                for i in range(1,len(sentences)):
                    embeddings = shared_model.embed_documents(combined_sentences)
                    current = embeddings[i]
                    prev = embeddings[i-1]

                    similarity = cosine_similarity([prev],[current])[0][0]
                    # print(f"similarity between {i-1} and {i} is {similarity}")
                    distances.append(1- similarity)
                print(distances)
                breakpoint_distance_threshold = 0.25
                indices_above_thresh = [i for i,x in enumerate(distances) if x > breakpoint_distance_threshold]
                print(indices_above_thresh)
                if len(indices_above_thresh) == 0:
                    yield (chunk, filename, chunk_id)
                    return
                o=0
                for i in range(len(indices_above_thresh)):
                    chunk_to_yield = " ".join(sentences[o:indices_above_thresh[i]])
                    print(chunk_to_yield)
                    yield (chunk_to_yield, filename, chunk_id)
                    o = indices_above_thresh[i]
                    chunk_id += 1 
                if o < len(sentences):
                    chunk_to_yield = "".join(sentences[o:len(sentences)])
                    print(chunk_to_yield)
                    yield (chunk_to_yield, filename, chunk_id)
                        # closed-distances[o]...distances[i]-open o<-i
        except Exception as e:
            print(f"Error reading {filename}: {e}")

In [17]:
files = os.listdir(r"C:\Users\22bcscs055\Downloads\test_data")
files = natsorted(files)
shared_model = SharedEmbeddingModel()
shared_model.initialize_model()
for fname in files:
    file_path = os.path.join(r"C:\Users\22bcscs055\Downloads\test_data", fname)
    for chunk in _semantic_chunking(file_path, shared_model):
        print(chunk)

[0, np.float64(0.24039139445479119), np.float64(0.11138506369664014), np.float64(0.1324346497807971), np.float64(0.02349088230540608), np.float64(0.1183792297856151), np.float64(0.18378144501692406), np.float64(0.14887880619393856), np.float64(0.10192206357010147), np.float64(0.14987422778126447), np.float64(0.09096656527480274), np.float64(0.10960152377312782), np.float64(0.10160671711434865), np.float64(0.10032842519581486), np.float64(0.08796016569652432), np.float64(0.10301426162967964), np.float64(0.10354926488642935), np.float64(0.13786734753126506), np.float64(0.09504877590209915), np.float64(0.13826515385067395), np.float64(0.09125461303246873), np.float64(0.15182645366127123), np.float64(0.10055220046198798), np.float64(0.13105223472546812), np.float64(0.1401103772511516), np.float64(0.16404717680700442), np.float64(0.10477582351257242), np.float64(0.04724637976085777), np.float64(0.11474758792837736)]
[]
('"BUSINESS & INDUSTRY.  "IN 1984 THIS AMERICAN COMPANY INTRODUCED THE F

In [14]:
from typing import Generator, Tuple
import os
from rag.parse_json import parser
from rag.pipeline import lazy_read
from rag.preprocess import preprocess_chunk_text

def _special_json_chunking(file_path: str) -> Generator[Tuple[str, str, int], None, None]:
        """Specialized JSON chunking for mock_data."""
        filename = os.path.basename(file_path)
        
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            chunk_id = 0
            # Lazily create chunks for memory and time efficiency
            for chunk in lazy_read(f, chunk_size_kb=4):
                chunk = " " + preprocess_chunk_text(chunk)
                print(chunk)
                root = parser(chunk)
                print(root)
                chunklets = []
                def chunks_in(node, chunk):
                    if node.children:
                        for child in node.children:
                            chunks_in(child, chunk)
                    else:
                        chunklets.append(chunk[node.start+1:node.end])
                chunks_in(root, chunk)
                for chunk_to_yield in chunklets:
                    if len(chunk_to_yield.strip()) > 5:  # Yield only non-empty chunks
                        yield (chunk_to_yield, filename, chunk_id)
                    chunk_id += 1


In [15]:
lst = list(_special_json_chunking(file_path = r"C:\Users\22bcscs055\Downloads\mock_data\doc_21.txt"))
for l in lst:
    print(l)

 include consideration of the student's. If there were two words to describe him, they are 'He cared. Overall crime dropped from 894 reports in 1997 to 874 in 1998. A third Fellowship the Richard A. Wiebe Public Service fellowship. , UPDATE 22-11 - University at Albany }, { NDSU does not discriminate on the basis of age, color, disability, gender. he learned when he was far away from home and everyone he'd ever. father, Walter Piehl Jr. announced them. called the most honest and open-hearted. I would because it is all part of the process. Texas State Seed and Plant Board. magazine - North Dakota State University }, { May 25, 2016. I know without a doubt that I. Send inquiries and submissions to HQ AFRC/ PAOM, 155 Richard Ray Blvd. are going to get a lot healthier fleet of aircraft, he said. Air Force officials announced April 12 that Davis-. open process to address F-35 basing, said Jennifer. and then to Beale AFB in 1998. here - Citizen Airman Magazine - U.S. Air Force }, { We are als

In [33]:
from typing import Generator, Tuple
from nltk.tokenize import sent_tokenize 
from rag.pipeline import SharedEmbeddingModel
from langchain_community.utils.math import cosine_similarity

def _semantic_chunking_logic(chunk: str, filename: str, chunk_id: int) -> Generator[Tuple[str, str, int, int], None, None]:
    
    sentences = []
    sentences = [x.strip() for x in sent_tokenize(chunk)]
    big_sentences = []
    # Combine short sentences
    for i in range(len(sentences)):
        if i == len(sentences)-1:
            big_sentences.append(sentences[i])
            break
        if len(sentences[i]) < 30:
            sentences[i+1] = sentences[i]+ " "+ sentences[i+1]
        else:
            big_sentences.append(sentences[i])

    sentences = big_sentences
    print(sentences)
    embeddings = []
    combined_sentences = []
    combined_sentences.append(sentences[0])
    distances = [0]
    for i in range(1,len(sentences)):
        combined_sentences.append(sentences[i-1]+sentences[i])
    # We combine two sentences to get better context for similarity
    for i in range(1,len(sentences)):
        embeddings = shared_model.embed_documents(combined_sentences)
        current = embeddings[i]
        prev = embeddings[i-1]

        similarity = cosine_similarity([prev],[current])[0][0]
        distances.append(1- similarity)
    breakpoint_distance_threshold = 0.25 #[0.20, 0.22, 0.25, 0.28, 0.30, 0.33, 0.35, 0.38, 0.40] # Tuned for BGE embeddings but can be increased a bit
    # for j, breakpoint in enumerate(breakpoint_distance_threshold):
    print(distances)
    j = 0
    indices_above_thresh = [i for i,x in enumerate(distances) if x > breakpoint_distance_threshold]
    # No breakpoints found - yield as single chunk or split if too large
    if len(indices_above_thresh) == 0:
        yield (chunk, filename, chunk_id, j)
        return
    # Creating chunks based on detected breakpoints
    o=0
    for i in range(len(indices_above_thresh)):
        chunk_to_yield = " ".join(sentences[o:indices_above_thresh[i]])
        yield (chunk_to_yield, filename, chunk_id, j)
        chunk_id += 1

        o = indices_above_thresh[i]
        
    if o < len(sentences):
        chunk_to_yield = " ".join(sentences[o:len(sentences)])
        yield (chunk_to_yield, filename, chunk_id, j)
        

In [34]:
chunk = """
Question for written answer E-9590/2010 to the Commission Rule 117 Raül Romeva i Rueda (Verts/ALE) (23 November 2010) Subject: Arms export contract between Spain and Saudi Arabia A few days ago, in the context of a visit to Spain by Prince Khaled bin Sultan, a newspaper report was published on the negotiations between Spain and Saudi Arabia on the sale to the latter of between 200 and 270 Leopard 2E combat vehicles. This, together with the export of at least 20 armoured recovery vehicles, will add up to a contract worth over three billion euros (excluding technical training and maintenance). It is also clear that if this operation is finalised, Spain will be contravening the Council Common Position 2008/944/CFSP of 8 December 2008 on arms exports, defining common rules governing control of exports of military technology and equipment. The content of this Common Position closely follows that of the Code of Conduct which preceded it during nine presidencies before its final adoption. The Common Position outlines the eight criteria to be taken into account before authorising arms sales. These are: (1) respect for international obligations, (2) respect for human rights in the country of final destination, (3) internal situation in the country of final destination, as a function of the existence of tensions or armed conflicts, (4) preservation of regional peace, security and stability, (5) national security of the Member States, (6) behaviour of the buyer country with regard to the international community, (7) existence of a risk of diversion, and (8) compatibility of the exports of the military technology or equipment with the technical and economic capacity of the recipient country. The Common Position further stipulates that another criterion for authorisation may be the economic, social, industrial and commercial interests of the exporting country. With regard to the above, it should be noted that Saudi Arabia does not exactly meet the eight criteria listed; in fact it fails to meet several of them, and if Spain were to go ahead with this contract it would be failing to meet its Community obligations. In view of all the above: Is the Commission informed of this situation? Does the Commission consider that Spain would be failing to comply with Common Position 2008/944/CFSP on control of arms exports? If so, what means will the Commission use to demand Spain's compliance with Community legislation?"""
# for sent in sent_tokenize(chunk):
#     print(sent)
shared_model = SharedEmbeddingModel()
shared_model.initialize_model()
for chunklet in _semantic_chunking_logic(chunk, "custom text", 0):
    print(chunklet)

['Question for written answer E-9590/2010 to the Commission Rule 117 Raül Romeva i Rueda (Verts/ALE) (23 November 2010) Subject: Arms export contract between Spain and Saudi Arabia A few days ago, in the context of a visit to Spain by Prince Khaled bin Sultan, a newspaper report was published on the negotiations between Spain and Saudi Arabia on the sale to the latter of between 200 and 270 Leopard 2E combat vehicles.', 'This, together with the export of at least 20 armoured recovery vehicles, will add up to a contract worth over three billion euros (excluding technical training and maintenance).', 'It is also clear that if this operation is finalised, Spain will be contravening the Council Common Position 2008/944/CFSP of 8 December 2008 on arms exports, defining common rules governing control of exports of military technology and equipment.', 'The content of this Common Position closely follows that of the Code of Conduct which preceded it during nine presidencies before its final ad