# Semantic Chunking
In this notebook I will try an semantic chunking startegy developed by Greg. [Link](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/8a30b5710b3dd99ef2239fb60c7b54bc38d3613d/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb)

In [10]:
with open('../fiqa_dataset/corpus.txt') as file:
    corpus = file.read()


In [11]:
import re

# Splitting the essay on '.', '?', and '!'
single_sentences_list = re.split(r'(?<=[.?!])\s+', corpus)
print (f"{len(single_sentences_list)} senteneces were found")


399187 senteneces were found


In [None]:
# num_remove_items = len(single_sentences_list) * 0.30

Create dictionary to store additional information later

In [12]:
sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(single_sentences_list)]
sentences[:3]

[{'sentence': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that.",
  'index': 0},
 {'sentence': "Training workers is not their job - they're building software.",
  'index': 1},
 {'sentence': 'Perhaps educational systems in the U.S.', 'index': 2}]

Now we want to combine the senteces with the one before and after. Therefor we use butter_size=1

In [13]:
def combine_sentences(sentences, buffer_size=1):
    # Go through each sentence dict
    for i in range(len(sentences)):

        # Create a string that will hold the sentences which are joined
        combined_sentence = ''

        # Add sentences before the current one, based on the buffer size.
        for j in range(i - buffer_size, i):
            # Check if the index j is not negative (to avoid index out of range like on the first one)
            if j >= 0:
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += sentences[j]['sentence'] + ' '

        # Add the current sentence
        combined_sentence += sentences[i]['sentence']

        # Add sentences after the current one, based on the buffer size
        for j in range(i + 1, i + 1 + buffer_size):
            # Check if the index j is within the range of the sentences list
            if j < len(sentences):
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += ' ' + sentences[j]['sentence']

        # Then add the whole thing to your dict
        # Store the combined sentence in the current sentence dict
        sentences[i]['combined_sentence'] = combined_sentence

    return sentences

sentences = combine_sentences(sentences)

In [14]:
sentences[:3]

[{'sentence': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that.",
  'index': 0,
  'combined_sentence': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software."},
 {'sentence': "Training workers is not their job - they're building software.",
  'index': 1,
  'combined_sentence': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S."},
 {'sentence': 'Perhaps educational systems in the U.S.',
  'index': 2,
  'combined_sentence': "Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education

We want to create embeddings for each batch (3 sentences) and add them to the corresponding dictionary index.

In [15]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import logger

load_dotenv()


True

In [18]:
oaiembeds = OpenAIEmbeddings(model='text-embedding-3-small', api_key=os.environ.get('OPENAI_API_KEY'))
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Embed all sentences
def embed_all_sentences(sentences):
    embeddings = []
    for i, sentence in enumerate(sentences):
        logger.warning(f'Creating embedding for sentence {i}')
        embedding = oaiembeds.embed_query(sentence['combined_sentence'])
        embeddings.append(embedding)
    return np.array(embeddings)

def calculate_cosine_distances_optimized(sentences):
    logger.warning('Embedding all sentences')
    embeddings = embed_all_sentences(sentences)
    
    logger.warning('Calculating cosine distances between consecutive sentences')
    # Calculate cosine distances between consecutive embeddings
    distances = cosine_distances(embeddings[:-1], embeddings[1:])
    
    distances_flat = distances.diagonal()  # Extract the diagonal which represents consecutive distances

    for i in range(len(sentences) - 1):
        distance = distances_flat[i]
        logger.warning(f'Cosine distance between sentence {i} and {i+1} is {distance}')
        sentences[i]['distance_to_next'] = distance

    return distances_flat, sentences

# Assuming 'sentences' is defined
distances, sentences = calculate_cosine_distances_optimized(sentences)



KeyboardInterrupt: 

In [16]:
print(len(sentences[3]['combined_sentence']) )

462


In [10]:

oaiembeds = OpenAIEmbeddings(model='text-embedding-3-small', api_key=os.environ.get('OPENAI_API_KEY'))

embeddings = oaiembeds.embed_documents(sentences[3]['combined_sentence'])

# for i, sentence in enumerate(sentences):
#     sentence['combined_sentence_embedding'] = embeddings[i]

In [26]:
print(embeddings[0])

[0.04412008573227445, 0.020497117766661594, 0.0001921859129006022, 0.005087648195471535, 0.021669312056929617, 0.0035633888164110956, -0.008465846306691982, 0.03116082860043677, -0.03513326221933786, -0.08745870984613369, 0.008677493093002038, 0.012088252570651086, -0.04014764838808296, 0.0017013096262286117, 0.014237274815260732, -0.01717590030046132, 0.027220954019657788, -0.021311140285844108, 0.017794559277685234, 0.05818641349020035, 0.0045585326266802985, 0.018266691407488492, -0.018478338193798547, -0.030411926020143656, -0.010110174589408468, -0.017810838796746283, -0.002812451994727298, 0.008905419398373141, 0.033472654417560324, -0.007529719478309487, 0.01854345999533315, -0.02697674633258043, -0.021425104369852264, -0.023899736553457517, 0.03241442607394565, -0.005295224170693727, -0.03848704058630543, 0.04014764838808296, -0.03163296073024003, 0.04548764356450108, -0.022499615957818384, 0.0027717508687681723, -0.03304936457023062, -0.004835300748863654, 0.06154019185716793,

Next, we calculate the cosine distances between sequential embedding pairs