In [18]:
import os 
from nltk.tokenize import sent_tokenize 
from typing import List, Tuple, Dict, Any, Optional, Generator
from langchain_community.utils.math import cosine_similarity
from rag.pipeline import SharedEmbeddingModel
from natsort import natsorted
import time

In [19]:
def lazy_read(file_handle, chunk_size_kb=4):
    """
    Generator that yields fixed-size chunks from an open file.

    Args:
        file_handle: Open file object in read mode. The chunk size is passed
            straight to ``file_handle.read()``, so it counts characters for a
            text-mode file and bytes for a binary-mode file.
        chunk_size_kb: Size of each chunk in units of 1024 (default: 4).
            Fractional values are accepted and truncated to a whole number
            of characters/bytes.

    Yields:
        Consecutive non-empty chunks of the file content. The last chunk may
        be shorter than the requested size.

    Raises:
        ValueError: If ``chunk_size_kb`` amounts to less than one
            character/byte. Because this is a generator, the error surfaces
            when iteration starts, not when the function is called.
    """
    chunk_size = int(chunk_size_kb * 1024)
    # read(0) returns an empty result (the loop would silently yield nothing)
    # and read(<0) returns the whole file at once, so reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size_kb must yield a positive chunk size, got {chunk_size_kb!r}"
        )

    while True:
        chunk = file_handle.read(chunk_size)
        if not chunk:
            # An empty read means end of file.
            break
        yield chunk

In [20]:
with open(r"C:\Users\22bcscs055\Downloads\final_train\collection_1.tsv", "r", encoding="utf-8") as f:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system
    # clock is adjusted.
    start_time = time.perf_counter()
    chunks = list(lazy_read(f, chunk_size_kb=4))
    end_time = time.perf_counter()
    print(f"Time taken to read chunks: {end_time - start_time} seconds")
# print(chunks)

Time taken to read chunks: 2.566915988922119 seconds


In [21]:
with open(r"C:\Users\22bcscs055\Downloads\final_train\collection_1.tsv", "r", encoding="utf-8") as f:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system
    # clock is adjusted.
    start_time = time.perf_counter()
    # NOTE: this rebinds `chunks` to a single string holding the whole file.
    chunks = f.read()
    end_time = time.perf_counter()
    print(f"Time taken to read entire file: {end_time - start_time} seconds")
# print(chunks)

Time taken to read entire file: 10.579607009887695 seconds


In [16]:
def _semantic_chunking(file_path: str, shared_model) -> Generator[Tuple[str, str, int], None, None]:
        filename = os.path.basename(file_path)
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                chunk = f.read()

                chunk_id = 0
                chunk_size = len(chunk)
                sentences = []
                sentences = sent_tokenize(chunk)
                big_sentences = []
                for i in range(len(sentences)-1):
                    if i == len(sentences)-1:
                        big_sentences.append(sentences[i])
                        break
                    if len(sentences[i]) < 20:
                        sentences[i+1] = sentences[i]+ " "+ sentences[i+1]
                    else:
                        big_sentences.append(sentences[i])

                sentences = big_sentences
                # print(sentences)
                embeddings = []
                combined_sentences = []
                combined_sentences.append(sentences[0])
                distances = [0]
                for i in range(1,len(sentences)):
                    combined_sentences.append(sentences[i-1]+sentences[i])

                # print(combined_sentences)
                for i in range(1,len(sentences)):
                    embeddings = shared_model.embed_documents(combined_sentences)
                    current = embeddings[i]
                    prev = embeddings[i-1]

                    similarity = cosine_similarity([prev],[current])[0][0]
                    # print(f"similarity between {i-1} and {i} is {similarity}")
                    distances.append(1- similarity)
                print(distances)
                breakpoint_distance_threshold = 0.25
                indices_above_thresh = [i for i,x in enumerate(distances) if x > breakpoint_distance_threshold]
                print(indices_above_thresh)
                if len(indices_above_thresh) == 0:
                    yield (chunk, filename, chunk_id)
                    return
                o=0
                for i in range(len(indices_above_thresh)):
                    chunk_to_yield = " ".join(sentences[o:indices_above_thresh[i]])
                    print(chunk_to_yield)
                    yield (chunk_to_yield, filename, chunk_id)
                    o = indices_above_thresh[i]
                    chunk_id += 1 
                if o < len(sentences):
                    chunk_to_yield = "".join(sentences[o:len(sentences)])
                    print(chunk_to_yield)
                    yield (chunk_to_yield, filename, chunk_id)
                        # closed-distances[o]...distances[i]-open o<-i
        except Exception as e:
            print(f"Error reading {filename}: {e}")

In [17]:
# Run the semantic chunker over every entry of the test-data folder,
# in natural sort order, printing each yielded tuple.
test_data_dir = r"C:\Users\22bcscs055\Downloads\test_data"
files = natsorted(os.listdir(test_data_dir))

shared_model = SharedEmbeddingModel()
shared_model.initialize_model()

for fname in files:
    file_path = os.path.join(test_data_dir, fname)
    for chunk in _semantic_chunking(file_path, shared_model):
        print(chunk)

[0, np.float64(0.24039139445479119), np.float64(0.11138506369664014), np.float64(0.1324346497807971), np.float64(0.02349088230540608), np.float64(0.1183792297856151), np.float64(0.18378144501692406), np.float64(0.14887880619393856), np.float64(0.10192206357010147), np.float64(0.14987422778126447), np.float64(0.09096656527480274), np.float64(0.10960152377312782), np.float64(0.10160671711434865), np.float64(0.10032842519581486), np.float64(0.08796016569652432), np.float64(0.10301426162967964), np.float64(0.10354926488642935), np.float64(0.13786734753126506), np.float64(0.09504877590209915), np.float64(0.13826515385067395), np.float64(0.09125461303246873), np.float64(0.15182645366127123), np.float64(0.10055220046198798), np.float64(0.13105223472546812), np.float64(0.1401103772511516), np.float64(0.16404717680700442), np.float64(0.10477582351257242), np.float64(0.04724637976085777), np.float64(0.11474758792837736)]
[]
('"BUSINESS & INDUSTRY.  "IN 1984 THIS AMERICAN COMPANY INTRODUCED THE F

In [14]:
from typing import Generator, Tuple
import os
from rag.parse_json import parser
from rag.pipeline import lazy_read
from rag.preprocess import preprocess_chunk_text

def _special_json_chunking(file_path: str) -> Generator[Tuple[str, str, int], None, None]:
    """
    Specialized JSON chunking for mock_data.

    The file is read lazily in 4 KB pieces. Each piece is preprocessed,
    prefixed with one space, and handed to ``parser``. The text span of every
    leaf node of the resulting tree becomes a candidate chunk.

    Yields:
        ``(chunk_text, filename, chunk_id)`` for every leaf span that has more
        than 5 characters once stripped. ``chunk_id`` advances for every leaf,
        yielded or not, so the emitted ids can have gaps.
    """
    def leaf_nodes(node):
        """Yield the childless nodes below ``node``, depth first."""
        if node.children:
            for child in node.children:
                yield from leaf_nodes(child)
        else:
            yield node

    filename = os.path.basename(file_path)
    chunk_id = 0

    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        # Read piece by piece so the whole file is never held in memory.
        for raw_chunk in lazy_read(f, chunk_size_kb=4):
            chunk = " " + preprocess_chunk_text(raw_chunk)
            print(chunk)
            root = parser(chunk)
            print(root)

            # Slice out every leaf span before yielding anything.
            leaf_texts = [chunk[leaf.start+1:leaf.end] for leaf in leaf_nodes(root)]

            for leaf_text in leaf_texts:
                if len(leaf_text.strip()) > 5:
                    yield (leaf_text, filename, chunk_id)
                chunk_id += 1


In [15]:
# Materialise every chunk of one mock document, then print them one per line.
lst = list(_special_json_chunking(file_path=r"C:\Users\22bcscs055\Downloads\mock_data\doc_21.txt"))
for chunk_record in lst:
    print(chunk_record)

 include consideration of the student's. If there were two words to describe him, they are 'He cared. Overall crime dropped from 894 reports in 1997 to 874 in 1998. A third Fellowship the Richard A. Wiebe Public Service fellowship. , UPDATE 22-11 - University at Albany }, { NDSU does not discriminate on the basis of age, color, disability, gender. he learned when he was far away from home and everyone he'd ever. father, Walter Piehl Jr. announced them. called the most honest and open-hearted. I would because it is all part of the process. Texas State Seed and Plant Board. magazine - North Dakota State University }, { May 25, 2016. I know without a doubt that I. Send inquiries and submissions to HQ AFRC/ PAOM, 155 Richard Ray Blvd. are going to get a lot healthier fleet of aircraft, he said. Air Force officials announced April 12 that Davis-. open process to address F-35 basing, said Jennifer. and then to Beale AFB in 1998. here - Citizen Airman Magazine - U.S. Air Force }, { We are als

In [11]:
from typing import Generator, Tuple
from nltk.tokenize import sent_tokenize 
from rag.pipeline import SharedEmbeddingModel
from langchain_community.utils.math import cosine_similarity

def _semantic_chunking_logic(chunk: str, filename: str, chunk_id: int) -> Generator[Tuple[str, str, int, int], None, None]:
    
    sentences = []
    sentences = [x.strip() for x in sent_tokenize(chunk)]
    big_sentences = []
    # Combine short sentences
    for i in range(len(sentences)):
        if i == len(sentences)-1:
            big_sentences.append(sentences[i])
            break
        if len(sentences[i]) < 30:
            sentences[i+1] = sentences[i]+ " "+ sentences[i+1]
        else:
            big_sentences.append(sentences[i])

    sentences = big_sentences
    print(sentences)
    embeddings = []
    combined_sentences = []
    combined_sentences.append(sentences[0])
    distances = [0]
    for i in range(1,len(sentences)):
        combined_sentences.append(sentences[i-1]+sentences[i])
    # We combine two sentences to get better context for similarity
    for i in range(1,len(sentences)):
        embeddings = shared_model.embed_documents(combined_sentences)
        current = embeddings[i]
        prev = embeddings[i-1]

        similarity = cosine_similarity([prev],[current])[0][0]
        distances.append(1- similarity)
    breakpoint_distance_threshold = 0.25 #[0.20, 0.22, 0.25, 0.28, 0.30, 0.33, 0.35, 0.38, 0.40] # Tuned for BGE embeddings but can be increased a bit
    # for j, breakpoint in enumerate(breakpoint_distance_threshold):
    print(distances)
    j = 0
    indices_above_thresh = [i for i,x in enumerate(distances) if x > breakpoint_distance_threshold]
    # No breakpoints found - yield as single chunk or split if too large
    if len(indices_above_thresh) == 0:
        yield (chunk, filename, chunk_id, j)
        return
    # Creating chunks based on detected breakpoints
    o=0
    for i in range(len(indices_above_thresh)):
        chunk_to_yield = " ".join(sentences[o:indices_above_thresh[i]])
        yield (chunk_to_yield, filename, chunk_id, j)
        chunk_id += 1

        o = indices_above_thresh[i]
        
    if o < len(sentences):
        chunk_to_yield = " ".join(sentences[o:len(sentences)])
        yield (chunk_to_yield, filename, chunk_id, j)
        

In [None]:
from nltk.tokenize import sent_tokenize
from rag.pipeline import SharedEmbeddingModel

chunk = """
to 80% of the states governed by the opposition, M. whereas measures such as arbitrary confiscation and expropriation, involving more than 760 enterprises since 2005, some of them affecting EU interests, undermine the basic social and economic rights of citizens, N. having regard to the tense political situation in Venezuela, reflected in the harassment, threats, intimidation and political and criminal persecution directed at the democratic opposition, its representatives, its democratically elected mayors and governors, the student movement, members of the army and the judiciary, opponents of Chavez official policy, journalists and the media, which has led to the imprisonment of many of them for political reasons, 1. Deplores the attacks on the independence of the judiciary; voices its concern at the arrest of Judge Afiuni and considers it a violation of her basic personal rights and a very serious threat to the independence of the judiciary, which is the basic pillar of the rule of law; 2. Calls for her release and calls on the Venezuelan Government to be committed to the values of the rule of law, facilitating a fair and rapid trial, with all the necessary legal guarantees; 3. Expresses its concern at the conditions of detention of Judge Afiuni, which pose a threat to her physical and psychological integrity, and calls on the prison authorities strictly and immediately to apply the measures and recommendations advocated by the Inter-American Commission on Human Rights on 11 January 2010 regarding Ms Afiuni's conditions of detention, 4. 
Condemns the public statements made by the President of the Republic of Venezuela, insulting and denigrating the judge, demanding a maximum sentence and requesting a modification of the law to enable a more severe penalty to be imposed; considers that these statements are aggravating the circumstances of her detention and constitute an attack on the independence of the judiciary by the President of a nation, who should be its first guarantor; 5. Reminds the Government of the Bolivarian Republic of Venezuela of its obligation to respect freedom of expression and opinion and freedom of the press and to respect the independence of the judiciary as it is bound to do under its own Constitution and under the different international and regional conventions and charters to which Venezuela is a signatory; believes that the Venezuelan media should guarantee pluralistic coverage of Venezuelan political and social life; 6. Calls on the Vice-President of the Commission/High Representativ
"""
# for sent in sent_tokenize(chunk):
#     print(sent)
shared_model = SharedEmbeddingModel()
shared_model.initialize_model()

# Chunk the sample text, print each yielded tuple and keep one embedding
# vector per chunk text.
vectors = []
for chunklet in _semantic_chunking_logic(chunk, "custom text", 0):
    print(chunklet)
    chunk_text = chunklet[0]
    chunk_vector = shared_model.embed_documents([chunk_text])[0]
    vectors.append(chunk_vector)


['to 80% of the states governed by the opposition, M. whereas measures such as arbitrary confiscation and expropriation, involving more than 760 enterprises since 2005, some of them affecting EU interests, undermine the basic social and economic rights of citizens, N. having regard to the tense political situation in Venezuela, reflected in the harassment, threats, intimidation and political and criminal persecution directed at the democratic opposition, its representatives, its democratically elected mayors and governors, the student movement, members of the army and the judiciary, opponents of Chavez official policy, journalists and the media, which has led to the imprisonment of many of them for political reasons, 1.', 'Deplores the attacks on the independence of the judiciary; voices its concern at the arrest of Judge Afiuni and considers it a violation of her basic personal rights and a very serious threat to the independence of the judiciary, which is the basic pillar of the rule

In [14]:
vec1 = [-0.03493355,-0.03170713,0.0042408663,0.025273018,0.06460026,0.0026172758,0.01295634,0.036503367,0.06684426,-0.031468004,0.039963163,-0.031400863,-0.02711173,0.0527382,0.011220825,0.06758879,0.040128242,0.015204559,-0.0313304,-0.00081508444,-0.008433926,0.0068474733,-0.007215298,0.0020545796,0.039347675,0.0285139,0.004439084,-0.052994404,-0.090077944,0.0066867643,0.086428255,0.01720144,0.012364687,0.034443498,0.024906864,0.055980522,-0.051386464,0.06553714,0.0047654137,-0.041526087,-0.05272709,-0.0437886,-0.038065743,0.012675302,-0.045532525,-0.0045356383,-0.07318548,0.025185583,-0.048642024,0.014091414,-0.013138795,0.010590372,0.044765253,0.00342331,-0.01053821,0.03225846,0.03540326,-0.005112022,0.03129769,-0.07065139,-0.007504459,-0.03283381,0.04372177,0.008766063,0.029467454,-0.020088047,0.01076435,0.025320435,-0.016020846,-0.039944433,-0.01042873,-0.054026112,-0.0039750296,0.01225227,0.03411172,-0.013248687,-0.035523914,0.040520728,0.026909808,0.016230788,-0.010356634,-0.0007371954,-0.04110713,0.017404128,-0.086318485,-0.05828952,0.03818761,-0.04837213,-0.05180846,0.059608333,-0.030624662,-0.040258992,0.039670788,0.03171448,0.026901312,-0.035201713,0.044061337,0.012717895,0.011481009,0.010909796,-0.034928214,-0.007896107,-0.042887412,-0.007205615,-0.05314195,0.025570158,0.021591598,-0.042984813,-0.05059027,-0.045191355,-0.03639832,-0.0010866785,-0.039625823,0.0500334,-0.04285816,0.10826418,0.0120527325,-0.041054964,-0.004515466,-0.045218173,0.013884772,0.024670703,-0.0349321,0.0788277,0.02276815,0.025072604,-0.029525818,0.02502647,-0.027928498,-0.06005826,0.044237293,0.039189402,-0.045903288,-0.030895771,0.030575048,0.030769525,0.014021471,-0.007144155,0.038797636,-0.0057932897,-0.025080942,-0.013009519,-0.021106368,-0.026724452,0.077707,-0.03573131,0.017920256,-0.064230755,-0.011273753,0.039720204,-0.051532626,0.07114925,0.05814114,-0.0014653972,0.050753973,0.05670581,0.038788557,-0.057223484,0.006669591,-0.0040938337,0.027923534,0.03294129,0.05002227,0.
024784211,0.007328759,0.0011197395,0.032083057,-0.032994986,-0.043405067,-0.009043483,-0.0630401,-0.023929643,0.045151487,-0.036953524,0.05533341,0.031657618,0.05090245,0.01028254,0.0048452253,0.02047846,-0.071725145,0.06993957,0.057661273,-0.036514703,0.006639827,-0.032793358,0.05732231,0.026024008,-0.0006757326,-0.0031000322,-0.06418778,-0.055125304,-0.022177188,0.0021912227,0.021094177,-0.02877005,-0.015119383,0.041614868,-0.009986402,0.0043341373,-0.0005051352,-0.031896587,-0.04508609,-0.044389218,-0.054332763,0.01446526,0.06109297,-0.023030618,-0.025225041,0.0132092945,0.0068314206,0.00701269,0.03976795,-0.013677374,0.051145695,-0.009715874,0.06518081,-0.047799524,-0.012903163,0.004430456,-0.008579756,-0.042889047,-0.010248604,0.0043698866,-0.01598232,0.083528854,0.029970717,-0.04480447,-0.058451492,0.023122368,0.0044078203,0.0060370374,0.013769131,-0.03424771,0.0076308423,-0.055146985,0.0078716315,-0.00005708275,0.058439456,-0.04656249,0.0416802,0.05141191,-0.04892885,0.0060635456,0.06502993,-0.03502476,-0.041911736,0.0072914716,-0.07118887,0.01462201,-0.0045701438,0.013106809,-0.0016700479,0.002087471,-0.016097726,-0.019893363,-0.01755428,0.0553985,0.040509094,0.014635931,-0.00031091628,0.05533115,0.005555098,0.02338224,-0.0145906815,-0.03329454,-0.04853973,0.009570602,-0.02484783,-0.012272663,0.032864824,-0.02699849,-0.044564743,-0.0047299396,0.03281665,0.010932126,-0.018114673,0.023104016,-0.0014874332,0.009699476,0.04039421,0.050925996,-0.030733645,-0.045072295,0.028014578,-0.071785994,-0.057844758,0.008281009,-0.008398281,0.030934604,-0.00089311897,0.05020985,-0.032203346,0.02020344,0.0419498,0.029909259,0.005073436,0.042094756,-0.027254768,0.042356186,0.0153774135,0.009852484,0.037684213,-0.03868431,0.027637767,-0.034424905,0.045348216,-0.06854544,-0.25435466,0.00039232662,-0.036938395,-0.009282678,0.046225436,0.036737528,-0.011165128,-0.022160754,-0.01880124,-0.009999511,0.006375659,-0.022826165,0.028305497,-0.004575156,0.030251285,0.00689404,-0.0259535
09,-0.027877705,-0.007550679,-0.0006783285,0.037125718,-0.07597611,-0.0009593983,0.06609528,0.034172777,0.027040679,-0.0060670925,0.008624863,-0.021815011,-0.010099134,-0.0012259372,0.009753398,0.017281998,-0.012634838,-0.03192159,-0.024348065,0.01552331,-0.0012578635,-0.015944144,-0.07305983,-0.016716737,0.0026827073,-0.030947326,-0.034177512,0.01845732,0.011909045,-0.048078343,-0.028453797,-0.013634606,0.06969685,0.049613025,-0.019770443,-0.04762482,-0.011578881,-0.0352286,-0.036099732,-0.042101104,0.000984414,-0.046108883,0.026737459,-0.007887374,0.019724416,-0.009732857,-0.039127834,0.03784331,-0.042570516,0.004446163,-0.018963348,0.059762787,-0.019034173,-0.03922239,-0.006018826,-0.020570371,-0.10668828,-0.009973899,-0.020321505,-0.0044628363,0.000041833537,-0.027158149,0.0006024007,0.0022564733,0.0063136164,0.046336684,-0.004527398,0.021786476,-0.03122514,0.013102416,0.00883681,0.008746777,-0.009859353,0.005802157,-0.0074510993,-0.0016336724,-0.017241651,0.000107256456,0.017156318,-0.02680852,-0.028930413,0.0057024523,0.014771908,0.022923287,-0.0148822395,-0.0053123953,-0.051177837,-0.032100618,-0.07453765,-0.046693165,-0.02213038,0.03989354,0.022602899,0.010667379,-0.006974967,0.017809944,-0.031472966,-0.002172687,-0.0049274857,-0.0083284825,0.036648188,-0.012360493,-0.0116069615,0.0076486277,0.0052579874,-0.037024308,-0.04164114,-0.036684945,-0.011206062,-0.039144535,0.032081615,0.0037366482,0.09281584,0.024817767,-0.00015231235,-0.031800255,-0.029404895,0.026720226,-0.038142845,-0.02539704,-0.035689734,-0.019914247,0.017737431,0.008550863,0.010197721,0.016844368,0.027051907,0.007976302,0.02127581,0.024425259,0.023414364,0.006286801,0.018554252,-0.052087314,0.0122942375,0.024326794,-0.0077464976,0.023840062,-0.06613844,0.026673373,0.0034250978,0.015068469,-0.021921534,-0.05082944,-0.04300819,-0.021758309,-0.0052316654,0.056819748,0.04119912,-0.019348823,0.013783091,-0.045834508,0.04676446,0.06314017,-0.02428748,-0.022121234,0.03485168,-0.0405835,0.022311548,
0.017074998,0.0125179095,0.042233422,0.01676256,0.020332772,0.00080962764,-0.045397446,0.027374936,0.056754146,0.015942235,-0.059888437,-0.034145255,-0.071660616,0.026568223,-0.0115131205,0.002620211,-0.021504587,0.06564387,-0.026759963,-0.076675124,-0.009929996,0.005747424,0.019101365,0.01339283,-0.026799392,0.04061366,-0.025296478,0.019672444,0.012576457,-0.059817474,-0.03958925,-0.027377399,-0.03681045,0.064405076,-0.0012337377,0.027534718,0.038077433,-0.016496431,-0.022055445,-0.048935533,0.040471807,-0.06522967,-0.03206342,-0.0024785448,0.062025998,-0.040062103,0.017703153,-0.005671032,-0.01399935,0.017739587,0.018675197,-0.021208638,-0.004845069,-0.022919638,0.029687162,-0.03692026,-0.0063601234,0.00476166,0.0069983085,0.035306275,0.004234952,0.0055940943,0.014007377,0.01824883,0.019876838,0.03276765,0.014836434,-0.011601812,0.010233057,0.008908962,0.026149718,0.028423328,0.030819215,0.07491529,-0.0013131546,0.033044748,-0.0282633,0.028295368,0.009812248,-0.023361674,-0.04230488,-0.016611496,-0.09192546,0.0019635337,-0.0015541762,0.04958404,0.006559557,-0.0044615013,0.069722064,-0.013906464,0.021075254,-0.022640858,0.0028450056,-0.038118932,-0.017705275,-0.048414186,0.007008369,0.02201033,0.031240074,0.019733617,0.015864003,-0.046099227,0.02331527,-0.028652506,-0.037391733,0.046462987,-0.033249166,-0.027277198,-0.005963817,-0.027922489,-0.065454416,0.015065099,-0.033092424,-0.016299829,-0.02194169,0.024435032,0.0031540808,0.045808684,-0.07440172,-0.008605192,0.046253826,-0.02704465,-0.01638483,0.021611499,-0.04988886,0.03967281,0.06574676,0.015364234,-0.0013830112,0.019085744,-0.010431011,-0.0137249045,0.039372325,-0.054585632,-0.016197303,-0.06733937,0.07055816,0.020259256,-0.042582296,-0.007146896,-0.017613837,0.0023248156,0.00674314,-0.0061870227,0.00047897996,0.046593934,0.013326988,0.0074838353,0.04540874,0.040414244,-0.03000809,0.027573407,0.028578758,0.02848858,0.048091937,-0.016213644,-0.024363717,0.07455237,-0.021948377,-0.047466803,-0.011432133,-0.02
5851667,-0.036775004,0.07784832,0.027399948,0.008013616,0.05812524,0.0442529,0.00046081885,0.012552724,0.025486227,0.02504835,0.03522261,-0.022620067,0.023957444,0.030301124,-0.09580215,-0.0066615394,-0.015301479,0.0021639226,-0.008581584,0.03296313,0.017975258,0.0063765775,-0.047200587,-0.035459496,0.05976195,0.0068569663,-0.01440611,-0.009866935,-0.0039320723,-0.03811712,0.041071907,0.021290911,0.0062631755,0.0012375322,0.03399067,0.031173289,0.017625399,0.008584894,0.08344515,-0.008153757,0.026696896,0.06806065,-0.027257334,-0.079746254,-0.02904971,0.0049318704,-0.03009126,-0.07775205,-0.0060963817,0.026884353,0.0139449695,0.005807256,0.0055641755,-0.038809467,0.017688168,0.050811477,-0.04107422,0.0016768811,-0.0025203938,0.024648238,-0.0345196,0.02935336,0.026250994,-0.008696308,-0.036035072,0.021502309,0.021105142,0.008626213,-0.038077917,-0.057255,-0.073730946,0.069548264,-0.007735049,0.009454004,-0.054826044,-0.019166673,-0.03288988,-0.029985879,0.08343207,0.021524545,-0.000112120346,-0.011941452,-0.044194955,0.03144053,-0.023285618,0.00035590454,-0.045063663,0.017940564,0.023245146,0.03299221,-0.018763732,0.018112822,-0.021081297,-0.004175624,0.028786326,0.00825628,-0.055946883,-0.042712662,0.0045718765,0.010449087,-0.061416592,-0.031121064,0.026457453,0.036754727,0.08069602,-0.015462223,0.05099674,-0.024698822,-0.013905681,-0.042237084,-0.02726886,0.042703435,0.05313466,0.00891975,-0.008964738,-0.042417917,0.064776674,0.035274345,-0.034589443,0.0075628767,0.03765009,-0.053791463]
shared_model = SharedEmbeddingModel()
shared_model.initialize_model()

# Embed the question and print its cosine distance to the stored vector vec1.
question = "According to the text, what was Richard Scrushy directed to do?"
vec2 = shared_model.embed_documents([question])[0]
# dis = []
# for vec in vectors:
#     dis.append(1- cosine_similarity([vec1],[vec2])[0][0])
similarity = cosine_similarity([vec1], [vec2])[0][0]
cosine_distance = 1 - similarity
print(cosine_distance)

0.4794751936502176


In [6]:

chunk = """
{ "BUSINESS & INDUSTRY", [{ " "IN 1984 THIS AMERICAN COMPANY INTRODUCED THE FIRST MINIVAN, CHRYSLER. INTUIT'S. THE GREEN GIANT WORKS FOR THIS CO. WITH ANOTHER JOLLY SPOKESFIGURE, THE DOUGHBOY, PILLSBURY. STARBUCKS. Free Flashcards about BUSINESS & INDUSTRY - StudyStack"}, { " "Sep 10, 2012. The Green Giant works for this company with another jolly spokesfigure, the Doughboy. Reply. Report Abuse Judge it! Alex Trebek. Ask Me A Trivia Question -- 2 -- - Discussion on Topix"}, { " "Mar 29, 1999. More than any other issue, the ethics of tobacco advertising -- both morally. In 1964, the company revived the cowboy but this time he was in mythical. The spokesfigure helped make McDonald's the most dominant. Jolly Green Giant. The Green Giant's national ad debut in 1928 was disappointing. TOP 10 ADVERTISING ICONS | News - AdAge"}, { " "1965: Poppin' Fresh, the Pillsbury Doughboy, makes his first appearance. Green Giant and other frozen-food companies gave Pillsbury a much larger share. 1965 by the Leo Burnett advertising agency--and Jolly Green Giant familiar figures. Other mergers in the works at this time included ties between Unilever and. History of The Pillsbury Company FundingUniverse"}, { " "It was 1925 when the Minnesota Valley Canning Company developed the seeds for a. The Green Giant became so recognizable that in 1950 Minnesota Valley. General Mills operates the Green Giant business in Europe and certain other. General Mills: Green Giant vegetables"}, { " "Feb 24, 2000. Hartwig, business team leader for Green Giant frozen vegetables, says her. Elizabeth Hanlin, who works in corporate communications, has had similar. The voice of the Jolly Green Giant tells what it's like to provide the Ho Ho Ho!. Jolly Green Giant and another company icon, the Pillsbury Doughboy,. Promoting Pillsbury: Vegetable giant renews campaign for value. 
"}, { " "The Jolly Green Giant is a mascot created by the Minnesota Valley Canning Company of Le Sueur, Minnesota, for its Green Giant brand of products now owned. Minnesota by Design Jolly Green Giant - Walker Art Center"}, { " "Mar 6, 2008. Green Giant Vegetables: The Green Giant. Nothing jolly
"""
# from rag.parse_json import parser
# with open(r"C:\Users\22bcscs055\Downloads\test_data_half_processed\doc_0.txt", "r", encoding="utf-8") as f:
#     chunk = f.read()
#     print(chunk)
# Prefix the text with one space before parsing; chunks_in() later slices
# each leaf as chunk[node.start+1:node.end].
chunk = f" {chunk}"
root = parser(chunk)
root.print_tree()
def chunks_in(node, chunk):
    """
    Collect the text covered by every leaf below ``node``.

    Args:
        node: Tree node with ``children``, ``start`` and ``end`` attributes.
        chunk: The string that the node offsets index into.

    Returns:
        List of ``chunk[leaf.start+1:leaf.end]`` slices, one per leaf, in
        depth-first order. Each slice is also printed as it is found.
    """
    if not node.children:
        # Leaf: its span is the text we want.
        part = chunk[node.start+1:node.end]
        print(part)
        return [part]

    # Inner node: concatenate the results of the children, left to right.
    return [
        part
        for child in node.children
        for part in chunks_in(child, chunk)
    ]
# Gather the text slice of every leaf node of the parsed tree (chunks_in also
# prints each slice as it goes), then print the resulting list.
chunklets = chunks_in(root, chunk)
print(chunklets)


0	2160
   |-- 2	2160
      |-- 27	2160
         |-- 27	28
         |-- 28	272
         |-- 272	275
         |-- 275	478
         |-- 478	481
         |-- 481	833
         |-- 833	836
         |-- 836	1203
         |-- 1203	1206
         |-- 1206	1477
         |-- 1477	1480
         |-- 1480	1864
         |-- 1864	1867
         |-- 1867	2085
         |-- 2085	2088
         |-- 2088	2160

 " "IN 1984 THIS AMERICAN COMPANY INTRODUCED THE FIRST MINIVAN, CHRYSLER. INTUIT'S. THE GREEN GIANT WORKS FOR THIS CO. WITH ANOTHER JOLLY SPOKESFIGURE, THE DOUGHBOY, PILLSBURY. STARBUCKS. Free Flashcards about BUSINESS & INDUSTRY - StudyStack"
, 
 " "Sep 10, 2012. The Green Giant works for this company with another jolly spokesfigure, the Doughboy. Reply. Report Abuse Judge it! Alex Trebek. Ask Me A Trivia Question -- 2 -- - Discussion on Topix"
, 
 " "Mar 29, 1999. More than any other issue, the ethics of tobacco advertising -- both morally. In 1964, the company revived the cowboy but this time he was 